github.com/ooni/psiphon/tunnel-core@v0.0.0-20230105123940-fe12a24c96ee/oovendor/bolt/db.go (about)

     1  package bolt
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"hash/fnv"
     7  	"log"
     8  	"os"
     9  	"runtime"
    10  	"sort"
    11  	"sync"
    12  	"time"
    13  	"unsafe"
    14  )
    15  
// maxMmapStep is the largest step that can be taken when remapping the mmap.
const maxMmapStep = 1 << 30 // 1GB

// version is the data file format version written into new meta pages.
const version = 2

// magic is a marker value to indicate that a file is a Bolt DB.
const magic uint32 = 0xED0CDAED

// pgidNoFreelist is the sentinel compared against the meta page's freelist
// field: hasSyncedFreelist reports false when the field equals this value, in
// which case the freelist is rebuilt by scanning the database.
const pgidNoFreelist pgid = 0xffffffffffffffff

// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
// syncing changes to a file.  This is required as some operating systems,
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
// must be synchronized using the msync(2) syscall.
const IgnoreNoSync = runtime.GOOS == "openbsd"

// Default values if not set in a DB instance. Open copies each of them into
// the corresponding DB field.
const (
	DefaultMaxBatchSize  int = 1000
	DefaultMaxBatchDelay     = 10 * time.Millisecond
	DefaultAllocSize         = 16 * 1024 * 1024
)

// defaultPageSize is the page size used for a db when Options.PageSize is
// zero; it is set to the OS page size.
var defaultPageSize = os.Getpagesize()

// flockRetryTimeout is the time elapsed between consecutive file locking attempts.
const flockRetryTimeout = 50 * time.Millisecond

// FreelistType is the type of the freelist backend.
type FreelistType string

const (
	// FreelistArrayType indicates backend freelist type is array
	FreelistArrayType = FreelistType("array")
	// FreelistMapType indicates backend freelist type is hashmap
	FreelistMapType = FreelistType("hashmap")
)
    55  
// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored.  See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// When true, skips syncing freelist to disk. This improves the database
	// write performance under normal operation, but requires a full database
	// re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options. Array, which is simple but endures
	// dramatic performance degradation if the database is large and fragmentation in the freelist is common.
	// The alternative is hashmap, which is faster in almost all circumstances
	// but does not guarantee that it offers the smallest page id available. In the normal case it is safe.
	// The default type is array.
	FreelistType FreelistType

	// When true, skips the truncate call when growing the database.
	// Setting this to true is only safe on non-ext3/ext4 systems.
	// Skipping truncation avoids preallocation of hard drive space and
	// bypasses a truncate() and fsync() syscall on remapping.
	//
	// https://github.com/boltdb/bolt/issues/284
	NoGrowSync bool

	// If you want to read the entire database fast, you can set MmapFlags to
	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Default value is
	// copied from DefaultMaxBatchSize in Open.
	//
	// If <=0, disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts.
	// Default value is copied from DefaultMaxBatchDelay in Open.
	//
	// If <=0, effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database
	// needs to create new pages. This is done to amortize the cost
	// of truncate() and fsync() when growing the data file.
	AllocSize int

	// Internal state.
	//
	//   path     - name reported by the opened file; cleared by close.
	//   openFile - file opener; Open uses Options.OpenFile or os.OpenFile.
	//   file     - the open data file; nil after a successful close.
	//   meta0/1  - references to the two meta pages, set by mmap.
	//   pageSize - page size in bytes, from Options, the OS, or the file's
	//              first meta page.
	//   opened   - set when Open constructs the DB, cleared by close.
	//   rwtx     - set by beginRWTx to the transaction it creates.
	//   txs      - open read-only transactions, tracked by beginTx/removeTx.
	//   stats    - accumulated statistics, guarded by statlock.
	path     string
	openFile func(string, int, os.FileMode) (*os.File, error)
	file     *os.File
	dataref  []byte // mmap'ed readonly, write throws SEGV
	data     *[maxMapSize]byte
	datasz   int
	filesz   int // current on disk file size
	meta0    *meta
	meta1    *meta
	pageSize int
	opened   bool
	rwtx     *Tx
	txs      []*Tx
	stats    Stats

	// [Psiphon]
	// https://github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
	mmapErr error // set on mmap failure; subsequently returned by all methods

	// freelist is built exactly once by loadFreelist, guarded by freelistLoad.
	freelist     *freelist
	freelistLoad sync.Once

	// pagePool hands out byte slices of pageSize bytes (initialized in Open).
	pagePool sync.Pool

	// batchMu guards batch, the batch currently accepting calls from Batch.
	batchMu sync.Mutex
	batch   *batch

	rwlock   sync.Mutex   // Allows only one writer at a time.
	metalock sync.Mutex   // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	// ops holds overridable I/O hooks; Open points writeAt at file.WriteAt.
	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// Read only mode.
	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}
   163  
   164  // Path returns the path to currently open database file.
   165  func (db *DB) Path() string {
   166  	return db.path
   167  }
   168  
   169  // GoString returns the Go string representation of the database.
   170  func (db *DB) GoString() string {
   171  	return fmt.Sprintf("bolt.DB{path:%q}", db.path)
   172  }
   173  
   174  // String returns the string representation of the database.
   175  func (db *DB) String() string {
   176  	return fmt.Sprintf("DB<%q>", db.path)
   177  }
   178  
   179  // Open creates and opens a database at the given path.
   180  // If the file does not exist then it will be created automatically.
   181  // Passing in nil options will cause Bolt to open the database with the default options.
   182  func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
   183  	db := &DB{
   184  		opened: true,
   185  	}
   186  
   187  	// [Psiphon]
   188  	// Ensure cleanup on panic so recovery can reset a locked file.
   189  	defer func() {
   190  		if r := recover(); r != nil {
   191  			_ = db.close()
   192  			panic(r)
   193  		}
   194  	}()
   195  
   196  	// Set default options if no options are provided.
   197  	if options == nil {
   198  		options = DefaultOptions
   199  	}
   200  	db.NoSync = options.NoSync
   201  	db.NoGrowSync = options.NoGrowSync
   202  	db.MmapFlags = options.MmapFlags
   203  	db.NoFreelistSync = options.NoFreelistSync
   204  	db.FreelistType = options.FreelistType
   205  
   206  	// Set default values for later DB operations.
   207  	db.MaxBatchSize = DefaultMaxBatchSize
   208  	db.MaxBatchDelay = DefaultMaxBatchDelay
   209  	db.AllocSize = DefaultAllocSize
   210  
   211  	flag := os.O_RDWR
   212  	if options.ReadOnly {
   213  		flag = os.O_RDONLY
   214  		db.readOnly = true
   215  	}
   216  
   217  	db.openFile = options.OpenFile
   218  	if db.openFile == nil {
   219  		db.openFile = os.OpenFile
   220  	}
   221  
   222  	// Open data file and separate sync handler for metadata writes.
   223  	var err error
   224  	if db.file, err = db.openFile(path, flag|os.O_CREATE, mode); err != nil {
   225  		_ = db.close()
   226  		return nil, err
   227  	}
   228  	db.path = db.file.Name()
   229  
   230  	// Lock file so that other processes using Bolt in read-write mode cannot
   231  	// use the database  at the same time. This would cause corruption since
   232  	// the two processes would write meta pages and free pages separately.
   233  	// The database file is locked exclusively (only one process can grab the lock)
   234  	// if !options.ReadOnly.
   235  	// The database file is locked using the shared lock (more than one process may
   236  	// hold a lock at the same time) otherwise (options.ReadOnly is set).
   237  	if err := flock(db, !db.readOnly, options.Timeout); err != nil {
   238  		_ = db.close()
   239  		return nil, err
   240  	}
   241  
   242  	// Default values for test hooks
   243  	db.ops.writeAt = db.file.WriteAt
   244  
   245  	if db.pageSize = options.PageSize; db.pageSize == 0 {
   246  		// Set the default page size to the OS page size.
   247  		db.pageSize = defaultPageSize
   248  	}
   249  
   250  	// Initialize the database if it doesn't exist.
   251  	if info, err := db.file.Stat(); err != nil {
   252  		_ = db.close()
   253  		return nil, err
   254  	} else if info.Size() == 0 {
   255  		// Initialize new files with meta pages.
   256  		if err := db.init(); err != nil {
   257  			// clean up file descriptor on initialization fail
   258  			_ = db.close()
   259  			return nil, err
   260  		}
   261  	} else {
   262  		// Read the first meta page to determine the page size.
   263  		var buf [0x1000]byte
   264  		// If we can't read the page size, but can read a page, assume
   265  		// it's the same as the OS or one given -- since that's how the
   266  		// page size was chosen in the first place.
   267  		//
   268  		// If the first page is invalid and this OS uses a different
   269  		// page size than what the database was created with then we
   270  		// are out of luck and cannot access the database.
   271  		//
   272  		// TODO: scan for next page
   273  		if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
   274  			if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil {
   275  				db.pageSize = int(m.pageSize)
   276  			}
   277  		} else {
   278  			_ = db.close()
   279  			return nil, ErrInvalid
   280  		}
   281  	}
   282  
   283  	// Initialize page pool.
   284  	db.pagePool = sync.Pool{
   285  		New: func() interface{} {
   286  			return make([]byte, db.pageSize)
   287  		},
   288  	}
   289  
   290  	// Memory map the data file.
   291  	if err := db.mmap(options.InitialMmapSize); err != nil {
   292  		_ = db.close()
   293  		return nil, err
   294  	}
   295  
   296  	if db.readOnly {
   297  		return db, nil
   298  	}
   299  
   300  	db.loadFreelist()
   301  
   302  	// Flush freelist when transitioning from no sync to sync so
   303  	// NoFreelistSync unaware boltdb can open the db later.
   304  	if !db.NoFreelistSync && !db.hasSyncedFreelist() {
   305  		tx, err := db.Begin(true)
   306  		if tx != nil {
   307  			err = tx.Commit()
   308  		}
   309  		if err != nil {
   310  			_ = db.close()
   311  			return nil, err
   312  		}
   313  	}
   314  
   315  	// Mark the database as opened and return.
   316  	return db, nil
   317  }
   318  
   319  // loadFreelist reads the freelist if it is synced, or reconstructs it
   320  // by scanning the DB if it is not synced. It assumes there are no
   321  // concurrent accesses being made to the freelist.
   322  func (db *DB) loadFreelist() {
   323  	db.freelistLoad.Do(func() {
   324  		db.freelist = newFreelist(db.FreelistType)
   325  		if !db.hasSyncedFreelist() {
   326  			// Reconstruct free list by scanning the DB.
   327  			db.freelist.readIDs(db.freepages())
   328  		} else {
   329  			// Read free list from freelist page.
   330  			db.freelist.read(db.page(db.meta().freelist))
   331  		}
   332  		db.stats.FreePageN = db.freelist.free_count()
   333  	})
   334  }
   335  
   336  func (db *DB) hasSyncedFreelist() bool {
   337  	return db.meta().freelist != pgidNoFreelist
   338  }
   339  
   340  // mmap opens the underlying memory-mapped file and initializes the meta references.
   341  // minsz is the minimum size that the new mmap can be.
   342  func (db *DB) mmap(minsz int) error {
   343  	db.mmaplock.Lock()
   344  	defer db.mmaplock.Unlock()
   345  
   346  	info, err := db.file.Stat()
   347  	if err != nil {
   348  		return fmt.Errorf("mmap stat error: %s", err)
   349  	} else if int(info.Size()) < db.pageSize*2 {
   350  		return fmt.Errorf("file size too small")
   351  	}
   352  
   353  	// Ensure the size is at least the minimum size.
   354  	var size = int(info.Size())
   355  	if size < minsz {
   356  		size = minsz
   357  	}
   358  	size, err = db.mmapSize(size)
   359  	if err != nil {
   360  		return err
   361  	}
   362  
   363  	// Dereference all mmap references before unmapping.
   364  	if db.rwtx != nil {
   365  		db.rwtx.root.dereference()
   366  	}
   367  
   368  	// Unmap existing data before continuing.
   369  	if err := db.munmap(); err != nil {
   370  		return err
   371  	}
   372  
   373  	// Memory-map the data file as a byte slice.
   374  	if err := mmap(db, size); err != nil {
   375  
   376  		// [Psiphon]
   377  		// https://github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
   378  		// If mmap fails, we cannot safely continue. Mark the db as unusable,
   379  		// causing all future calls to return the mmap error.
   380  		db.mmapErr = MmapError(err.Error())
   381  		return db.mmapErr
   382  	}
   383  
   384  	// Save references to the meta pages.
   385  	db.meta0 = db.page(0).meta()
   386  	db.meta1 = db.page(1).meta()
   387  
   388  	// Validate the meta pages. We only return an error if both meta pages fail
   389  	// validation, since meta0 failing validation means that it wasn't saved
   390  	// properly -- but we can recover using meta1. And vice-versa.
   391  	err0 := db.meta0.validate()
   392  	err1 := db.meta1.validate()
   393  	if err0 != nil && err1 != nil {
   394  		return err0
   395  	}
   396  
   397  	return nil
   398  }
   399  
   400  // munmap unmaps the data file from memory.
   401  func (db *DB) munmap() error {
   402  	if err := munmap(db); err != nil {
   403  		return fmt.Errorf("unmap error: " + err.Error())
   404  	}
   405  	return nil
   406  }
   407  
   408  // mmapSize determines the appropriate size for the mmap given the current size
   409  // of the database. The minimum size is 32KB and doubles until it reaches 1GB.
   410  // Returns an error if the new mmap size is greater than the max allowed.
   411  func (db *DB) mmapSize(size int) (int, error) {
   412  	// Double the size from 32KB until 1GB.
   413  	for i := uint(15); i <= 30; i++ {
   414  		if size <= 1<<i {
   415  			return 1 << i, nil
   416  		}
   417  	}
   418  
   419  	// Verify the requested size is not above the maximum allowed.
   420  	if size > maxMapSize {
   421  		return 0, fmt.Errorf("mmap too large")
   422  	}
   423  
   424  	// If larger than 1GB then grow by 1GB at a time.
   425  	sz := int64(size)
   426  	if remainder := sz % int64(maxMmapStep); remainder > 0 {
   427  		sz += int64(maxMmapStep) - remainder
   428  	}
   429  
   430  	// Ensure that the mmap size is a multiple of the page size.
   431  	// This should always be true since we're incrementing in MBs.
   432  	pageSize := int64(db.pageSize)
   433  	if (sz % pageSize) != 0 {
   434  		sz = ((sz / pageSize) + 1) * pageSize
   435  	}
   436  
   437  	// If we've exceeded the max size then only grow up to the max size.
   438  	if sz > maxMapSize {
   439  		sz = maxMapSize
   440  	}
   441  
   442  	return int(sz), nil
   443  }
   444  
   445  // init creates a new database file and initializes its meta pages.
   446  func (db *DB) init() error {
   447  	// Create two meta pages on a buffer.
   448  	buf := make([]byte, db.pageSize*4)
   449  	for i := 0; i < 2; i++ {
   450  		p := db.pageInBuffer(buf[:], pgid(i))
   451  		p.id = pgid(i)
   452  		p.flags = metaPageFlag
   453  
   454  		// Initialize the meta page.
   455  		m := p.meta()
   456  		m.magic = magic
   457  		m.version = version
   458  		m.pageSize = uint32(db.pageSize)
   459  		m.freelist = 2
   460  		m.root = bucket{root: 3}
   461  		m.pgid = 4
   462  		m.txid = txid(i)
   463  		m.checksum = m.sum64()
   464  	}
   465  
   466  	// Write an empty freelist at page 3.
   467  	p := db.pageInBuffer(buf[:], pgid(2))
   468  	p.id = pgid(2)
   469  	p.flags = freelistPageFlag
   470  	p.count = 0
   471  
   472  	// Write an empty leaf page at page 4.
   473  	p = db.pageInBuffer(buf[:], pgid(3))
   474  	p.id = pgid(3)
   475  	p.flags = leafPageFlag
   476  	p.count = 0
   477  
   478  	// Write the buffer to our data file.
   479  	if _, err := db.ops.writeAt(buf, 0); err != nil {
   480  		return err
   481  	}
   482  	if err := fdatasync(db); err != nil {
   483  		return err
   484  	}
   485  
   486  	return nil
   487  }
   488  
   489  // Close releases all database resources.
   490  // It will block waiting for any open transactions to finish
   491  // before closing the database and returning.
   492  func (db *DB) Close() error {
   493  	db.rwlock.Lock()
   494  	defer db.rwlock.Unlock()
   495  
   496  	db.metalock.Lock()
   497  	defer db.metalock.Unlock()
   498  
   499  	db.mmaplock.Lock()
   500  	defer db.mmaplock.Unlock()
   501  
   502  	return db.close()
   503  }
   504  
   505  func (db *DB) close() error {
   506  	if !db.opened {
   507  		return nil
   508  	}
   509  
   510  	db.opened = false
   511  
   512  	db.freelist = nil
   513  
   514  	// Clear ops.
   515  	db.ops.writeAt = nil
   516  
   517  	// Close the mmap.
   518  	if err := db.munmap(); err != nil {
   519  		return err
   520  	}
   521  
   522  	// Close file handles.
   523  	if db.file != nil {
   524  		// No need to unlock read-only file.
   525  		if !db.readOnly {
   526  			// Unlock the file.
   527  			if err := funlock(db); err != nil {
   528  				log.Printf("bolt.Close(): funlock error: %s", err)
   529  			}
   530  		}
   531  
   532  		// Close the file descriptor.
   533  		if err := db.file.Close(); err != nil {
   534  			return fmt.Errorf("db file close: %s", err)
   535  		}
   536  		db.file = nil
   537  	}
   538  
   539  	db.path = ""
   540  	return nil
   541  }
   542  
   543  // Begin starts a new transaction.
   544  // Multiple read-only transactions can be used concurrently but only one
   545  // write transaction can be used at a time. Starting multiple write transactions
   546  // will cause the calls to block and be serialized until the current write
   547  // transaction finishes.
   548  //
   549  // Transactions should not be dependent on one another. Opening a read
   550  // transaction and a write transaction in the same goroutine can cause the
   551  // writer to deadlock because the database periodically needs to re-mmap itself
   552  // as it grows and it cannot do that while a read transaction is open.
   553  //
   554  // If a long running read transaction (for example, a snapshot transaction) is
   555  // needed, you might want to set DB.InitialMmapSize to a large enough value
   556  // to avoid potential blocking of write transaction.
   557  //
   558  // IMPORTANT: You must close read-only transactions after you are finished or
   559  // else the database will not reclaim old pages.
   560  func (db *DB) Begin(writable bool) (*Tx, error) {
   561  	if writable {
   562  		return db.beginRWTx()
   563  	}
   564  	return db.beginTx()
   565  }
   566  
   567  func (db *DB) beginTx() (*Tx, error) {
   568  	// Lock the meta pages while we initialize the transaction. We obtain
   569  	// the meta lock before the mmap lock because that's the order that the
   570  	// write transaction will obtain them.
   571  	db.metalock.Lock()
   572  
   573  	// Obtain a read-only lock on the mmap. When the mmap is remapped it will
   574  	// obtain a write lock so all transactions must finish before it can be
   575  	// remapped.
   576  	db.mmaplock.RLock()
   577  
   578  	// Exit if the database is not open yet.
   579  	if !db.opened {
   580  		db.mmaplock.RUnlock()
   581  		db.metalock.Unlock()
   582  		return nil, ErrDatabaseNotOpen
   583  	}
   584  
   585  	// [Psiphon]
   586  	// https://github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
   587  	// Return mmap error if a previous mmap failed.
   588  	if db.mmapErr != nil {
   589  		db.mmaplock.RUnlock()
   590  		db.metalock.Unlock()
   591  		return nil, db.mmapErr
   592  	}
   593  
   594  	// Create a transaction associated with the database.
   595  	t := &Tx{}
   596  	t.init(db)
   597  
   598  	// Keep track of transaction until it closes.
   599  	db.txs = append(db.txs, t)
   600  	n := len(db.txs)
   601  
   602  	// Unlock the meta pages.
   603  	db.metalock.Unlock()
   604  
   605  	// Update the transaction stats.
   606  	db.statlock.Lock()
   607  	db.stats.TxN++
   608  	db.stats.OpenTxN = n
   609  	db.statlock.Unlock()
   610  
   611  	return t, nil
   612  }
   613  
   614  func (db *DB) beginRWTx() (*Tx, error) {
   615  	// If the database was opened with Options.ReadOnly, return an error.
   616  	if db.readOnly {
   617  		return nil, ErrDatabaseReadOnly
   618  	}
   619  
   620  	// Obtain writer lock. This is released by the transaction when it closes.
   621  	// This enforces only one writer transaction at a time.
   622  	db.rwlock.Lock()
   623  
   624  	// Once we have the writer lock then we can lock the meta pages so that
   625  	// we can set up the transaction.
   626  	db.metalock.Lock()
   627  	defer db.metalock.Unlock()
   628  
   629  	// Exit if the database is not open yet.
   630  	if !db.opened {
   631  		db.rwlock.Unlock()
   632  		return nil, ErrDatabaseNotOpen
   633  	}
   634  
   635  	// [Psiphon]
   636  	// https://github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
   637  	// Return mmap error if a previous mmap failed.
   638  	if db.mmapErr != nil {
   639  		db.rwlock.Unlock()
   640  		return nil, db.mmapErr
   641  	}
   642  
   643  	// Create a transaction associated with the database.
   644  	t := &Tx{writable: true}
   645  	t.init(db)
   646  	db.rwtx = t
   647  	db.freePages()
   648  	return t, nil
   649  }
   650  
   651  // freePages releases any pages associated with closed read-only transactions.
   652  func (db *DB) freePages() {
   653  	// Free all pending pages prior to earliest open transaction.
   654  	sort.Sort(txsById(db.txs))
   655  	minid := txid(0xFFFFFFFFFFFFFFFF)
   656  	if len(db.txs) > 0 {
   657  		minid = db.txs[0].meta.txid
   658  	}
   659  	if minid > 0 {
   660  		db.freelist.release(minid - 1)
   661  	}
   662  	// Release unused txid extents.
   663  	for _, t := range db.txs {
   664  		db.freelist.releaseRange(minid, t.meta.txid-1)
   665  		minid = t.meta.txid + 1
   666  	}
   667  	db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
   668  	// Any page both allocated and freed in an extent is safe to release.
   669  }
   670  
   671  type txsById []*Tx
   672  
   673  func (t txsById) Len() int           { return len(t) }
   674  func (t txsById) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
   675  func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }
   676  
   677  // removeTx removes a transaction from the database.
   678  func (db *DB) removeTx(tx *Tx) {
   679  	// Release the read lock on the mmap.
   680  	db.mmaplock.RUnlock()
   681  
   682  	// Use the meta lock to restrict access to the DB object.
   683  	db.metalock.Lock()
   684  
   685  	// Remove the transaction.
   686  	for i, t := range db.txs {
   687  		if t == tx {
   688  			last := len(db.txs) - 1
   689  			db.txs[i] = db.txs[last]
   690  			db.txs[last] = nil
   691  			db.txs = db.txs[:last]
   692  			break
   693  		}
   694  	}
   695  	n := len(db.txs)
   696  
   697  	// Unlock the meta pages.
   698  	db.metalock.Unlock()
   699  
   700  	// Merge statistics.
   701  	db.statlock.Lock()
   702  	db.stats.OpenTxN = n
   703  	db.stats.TxStats.add(&tx.stats)
   704  	db.statlock.Unlock()
   705  }
   706  
   707  // Update executes a function within the context of a read-write managed transaction.
   708  // If no error is returned from the function then the transaction is committed.
   709  // If an error is returned then the entire transaction is rolled back.
   710  // Any error that is returned from the function or returned from the commit is
   711  // returned from the Update() method.
   712  //
   713  // Attempting to manually commit or rollback within the function will cause a panic.
   714  func (db *DB) Update(fn func(*Tx) error) error {
   715  	t, err := db.Begin(true)
   716  	if err != nil {
   717  		return err
   718  	}
   719  
   720  	// Make sure the transaction rolls back in the event of a panic.
   721  	defer func() {
   722  		if t.db != nil {
   723  			t.rollback()
   724  		}
   725  	}()
   726  
   727  	// Mark as a managed tx so that the inner function cannot manually commit.
   728  	t.managed = true
   729  
   730  	// If an error is returned from the function then rollback and return error.
   731  	err = fn(t)
   732  	t.managed = false
   733  	if err != nil {
   734  		_ = t.Rollback()
   735  		return err
   736  	}
   737  
   738  	return t.Commit()
   739  }
   740  
   741  // View executes a function within the context of a managed read-only transaction.
   742  // Any error that is returned from the function is returned from the View() method.
   743  //
   744  // Attempting to manually rollback within the function will cause a panic.
   745  func (db *DB) View(fn func(*Tx) error) error {
   746  	t, err := db.Begin(false)
   747  	if err != nil {
   748  		return err
   749  	}
   750  
   751  	// Make sure the transaction rolls back in the event of a panic.
   752  	defer func() {
   753  		if t.db != nil {
   754  			t.rollback()
   755  		}
   756  	}()
   757  
   758  	// Mark as a managed tx so that the inner function cannot manually rollback.
   759  	t.managed = true
   760  
   761  	// If an error is returned from the function then pass it through.
   762  	err = fn(t)
   763  	t.managed = false
   764  	if err != nil {
   765  		_ = t.Rollback()
   766  		return err
   767  	}
   768  
   769  	return t.Rollback()
   770  }
   771  
   772  // Batch calls fn as part of a batch. It behaves similar to Update,
   773  // except:
   774  //
   775  // 1. concurrent Batch calls can be combined into a single Bolt
   776  // transaction.
   777  //
   778  // 2. the function passed to Batch may be called multiple times,
   779  // regardless of whether it returns error or not.
   780  //
   781  // This means that Batch function side effects must be idempotent and
   782  // take permanent effect only after a successful return is seen in
   783  // caller.
   784  //
   785  // The maximum batch size and delay can be adjusted with DB.MaxBatchSize
   786  // and DB.MaxBatchDelay, respectively.
   787  //
   788  // Batch is only useful when there are multiple goroutines calling it.
   789  func (db *DB) Batch(fn func(*Tx) error) error {
   790  	errCh := make(chan error, 1)
   791  
   792  	db.batchMu.Lock()
   793  	if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
   794  		// There is no existing batch, or the existing batch is full; start a new one.
   795  		db.batch = &batch{
   796  			db: db,
   797  		}
   798  		db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
   799  	}
   800  	db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
   801  	if len(db.batch.calls) >= db.MaxBatchSize {
   802  		// wake up batch, it's ready to run
   803  		go db.batch.trigger()
   804  	}
   805  	db.batchMu.Unlock()
   806  
   807  	err := <-errCh
   808  	if err == trySolo {
   809  		err = db.Update(fn)
   810  	}
   811  	return err
   812  }
   813  
// call is one function queued by DB.Batch together with the channel on which
// its result is reported.
type call struct {
	fn  func(*Tx) error // function to execute inside the batch transaction
	err chan<- error    // receives the outcome (or trySolo) for this call
}
   818  
// batch collects calls submitted through DB.Batch so they can be executed
// together in a single Update transaction.
type batch struct {
	db    *DB         // database the batch runs against
	timer *time.Timer // fires trigger after DB.MaxBatchDelay; stopped in run
	start sync.Once   // ensures run executes at most once
	calls []call      // queued calls; appended to under db.batchMu in DB.Batch
}
   825  
   826  // trigger runs the batch if it hasn't already been run.
   827  func (b *batch) trigger() {
   828  	b.start.Do(b.run)
   829  }
   830  
   831  // run performs the transactions in the batch and communicates results
   832  // back to DB.Batch.
   833  func (b *batch) run() {
   834  	b.db.batchMu.Lock()
   835  	b.timer.Stop()
   836  	// Make sure no new work is added to this batch, but don't break
   837  	// other batches.
   838  	if b.db.batch == b {
   839  		b.db.batch = nil
   840  	}
   841  	b.db.batchMu.Unlock()
   842  
   843  retry:
   844  	for len(b.calls) > 0 {
   845  		var failIdx = -1
   846  		err := b.db.Update(func(tx *Tx) error {
   847  			for i, c := range b.calls {
   848  				if err := safelyCall(c.fn, tx); err != nil {
   849  					failIdx = i
   850  					return err
   851  				}
   852  			}
   853  			return nil
   854  		})
   855  
   856  		if failIdx >= 0 {
   857  			// take the failing transaction out of the batch. it's
   858  			// safe to shorten b.calls here because db.batch no longer
   859  			// points to us, and we hold the mutex anyway.
   860  			c := b.calls[failIdx]
   861  			b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
   862  			// tell the submitter re-run it solo, continue with the rest of the batch
   863  			c.err <- trySolo
   864  			continue retry
   865  		}
   866  
   867  		// pass success, or bolt internal errors, to all callers
   868  		for _, c := range b.calls {
   869  			c.err <- err
   870  		}
   871  		break retry
   872  	}
   873  }
   874  
// trySolo is a special sentinel error value used for signaling that a
// transaction function should be re-run. It should never be seen by
// callers: batch.run sends it to the submitter of a failing call, and
// DB.Batch reacts to it by re-running that function through Update.
var trySolo = errors.New("batch function returned an error and should be re-run solo")
   879  
   880  type panicked struct {
   881  	reason interface{}
   882  }
   883  
   884  func (p panicked) Error() string {
   885  	if err, ok := p.reason.(error); ok {
   886  		return err.Error()
   887  	}
   888  	return fmt.Sprintf("panic: %v", p.reason)
   889  }
   890  
   891  func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
   892  	defer func() {
   893  		if p := recover(); p != nil {
   894  			err = panicked{p}
   895  		}
   896  	}()
   897  	return fn(tx)
   898  }
   899  
   900  // Sync executes fdatasync() against the database file handle.
   901  //
   902  // This is not necessary under normal operation, however, if you use NoSync
   903  // then it allows you to force the database file to sync against the disk.
   904  func (db *DB) Sync() error { return fdatasync(db) }
   905  
   906  // Stats retrieves ongoing performance stats for the database.
   907  // This is only updated when a transaction closes.
   908  func (db *DB) Stats() Stats {
   909  	db.statlock.RLock()
   910  	defer db.statlock.RUnlock()
   911  	return db.stats
   912  }
   913  
   914  // This is for internal access to the raw data bytes from the C cursor, use
   915  // carefully, or not at all.
   916  func (db *DB) Info() *Info {
   917  	return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
   918  }
   919  
   920  // page retrieves a page reference from the mmap based on the current page size.
   921  func (db *DB) page(id pgid) *page {
   922  	pos := id * pgid(db.pageSize)
   923  	return (*page)(unsafe.Pointer(&db.data[pos]))
   924  }
   925  
   926  // pageInBuffer retrieves a page reference from a given byte array based on the current page size.
   927  func (db *DB) pageInBuffer(b []byte, id pgid) *page {
   928  	return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)]))
   929  }
   930  
   931  // meta retrieves the current meta page reference.
   932  func (db *DB) meta() *meta {
   933  	// We have to return the meta with the highest txid which doesn't fail
   934  	// validation. Otherwise, we can cause errors when in fact the database is
   935  	// in a consistent state. metaA is the one with the higher txid.
   936  	metaA := db.meta0
   937  	metaB := db.meta1
   938  	if db.meta1.txid > db.meta0.txid {
   939  		metaA = db.meta1
   940  		metaB = db.meta0
   941  	}
   942  
   943  	// Use higher meta page if valid. Otherwise fallback to previous, if valid.
   944  	if err := metaA.validate(); err == nil {
   945  		return metaA
   946  	} else if err := metaB.validate(); err == nil {
   947  		return metaB
   948  	}
   949  
   950  	// This should never be reached, because both meta1 and meta0 were validated
   951  	// on mmap() and we do fsync() on every write.
   952  	panic("bolt.DB.meta(): invalid meta pages")
   953  }
   954  
   955  // allocate returns a contiguous block of memory starting at a given page.
   956  func (db *DB) allocate(txid txid, count int) (*page, error) {
   957  	// Allocate a temporary buffer for the page.
   958  	var buf []byte
   959  	if count == 1 {
   960  		buf = db.pagePool.Get().([]byte)
   961  	} else {
   962  		buf = make([]byte, count*db.pageSize)
   963  	}
   964  	p := (*page)(unsafe.Pointer(&buf[0]))
   965  	p.overflow = uint32(count - 1)
   966  
   967  	// Use pages from the freelist if they are available.
   968  	if p.id = db.freelist.allocate(txid, count); p.id != 0 {
   969  		return p, nil
   970  	}
   971  
   972  	// Resize mmap() if we're at the end.
   973  	p.id = db.rwtx.meta.pgid
   974  	var minsz = int((p.id+pgid(count))+1) * db.pageSize
   975  	if minsz >= db.datasz {
   976  		if err := db.mmap(minsz); err != nil {
   977  			return nil, fmt.Errorf("mmap allocate error: %s", err)
   978  		}
   979  	}
   980  
   981  	// Move the page id high water mark.
   982  	db.rwtx.meta.pgid += pgid(count)
   983  
   984  	return p, nil
   985  }
   986  
   987  // grow grows the size of the database to the given sz.
   988  func (db *DB) grow(sz int) error {
   989  	// Ignore if the new size is less than available file size.
   990  	if sz <= db.filesz {
   991  		return nil
   992  	}
   993  
   994  	// If the data is smaller than the alloc size then only allocate what's needed.
   995  	// Once it goes over the allocation size then allocate in chunks.
   996  	if db.datasz < db.AllocSize {
   997  		sz = db.datasz
   998  	} else {
   999  		sz += db.AllocSize
  1000  	}
  1001  
  1002  	// Truncate and fsync to ensure file size metadata is flushed.
  1003  	// https://github.com/boltdb/bolt/issues/284
  1004  	if !db.NoGrowSync && !db.readOnly {
  1005  		if runtime.GOOS != "windows" {
  1006  			if err := db.file.Truncate(int64(sz)); err != nil {
  1007  				return fmt.Errorf("file resize error: %s", err)
  1008  			}
  1009  		}
  1010  		if err := db.file.Sync(); err != nil {
  1011  			return fmt.Errorf("file sync error: %s", err)
  1012  		}
  1013  	}
  1014  
  1015  	db.filesz = sz
  1016  	return nil
  1017  }
  1018  
  1019  func (db *DB) IsReadOnly() bool {
  1020  	return db.readOnly
  1021  }
  1022  
  1023  func (db *DB) freepages() []pgid {
  1024  	tx, err := db.beginTx()
  1025  	defer func() {
  1026  		err = tx.Rollback()
  1027  		if err != nil {
  1028  			panic("freepages: failed to rollback tx")
  1029  		}
  1030  	}()
  1031  	if err != nil {
  1032  		panic("freepages: failed to open read only tx")
  1033  	}
  1034  
  1035  	reachable := make(map[pgid]*page)
  1036  	nofreed := make(map[pgid]bool)
  1037  
  1038  	// [Psiphon]
  1039  	// Use single-error checkBucket.
  1040  	err = tx.checkBucket(&tx.root, reachable, nofreed)
  1041  	if err != nil {
  1042  		panic(fmt.Sprintf("freepages: failed to get all reachable pages (%s)", err))
  1043  	}
  1044  
  1045  	var fids []pgid
  1046  	for i := pgid(2); i < db.meta().pgid; i++ {
  1047  		if _, ok := reachable[i]; !ok {
  1048  			fids = append(fids, i)
  1049  		}
  1050  	}
  1051  	return fids
  1052  }
  1053  
  1054  // Options represents the options that can be set when opening a database.
  1055  type Options struct {
  1056  	// Timeout is the amount of time to wait to obtain a file lock.
  1057  	// When set to zero it will wait indefinitely. This option is only
  1058  	// available on Darwin and Linux.
  1059  	Timeout time.Duration
  1060  
  1061  	// Sets the DB.NoGrowSync flag before memory mapping the file.
  1062  	NoGrowSync bool
  1063  
  1064  	// Do not sync freelist to disk. This improves the database write performance
  1065  	// under normal operation, but requires a full database re-sync during recovery.
  1066  	NoFreelistSync bool
  1067  
  1068  	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
  1069  	// dramatic performance degradation if database is large and framentation in freelist is common.
  1070  	// The alternative one is using hashmap, it is faster in almost all circumstances
  1071  	// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
  1072  	// The default type is array
  1073  	FreelistType FreelistType
  1074  
  1075  	// Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to
  1076  	// grab a shared lock (UNIX).
  1077  	ReadOnly bool
  1078  
  1079  	// Sets the DB.MmapFlags flag before memory mapping the file.
  1080  	MmapFlags int
  1081  
  1082  	// InitialMmapSize is the initial mmap size of the database
  1083  	// in bytes. Read transactions won't block write transaction
  1084  	// if the InitialMmapSize is large enough to hold database mmap
  1085  	// size. (See DB.Begin for more information)
  1086  	//
  1087  	// If <=0, the initial map size is 0.
  1088  	// If initialMmapSize is smaller than the previous database size,
  1089  	// it takes no effect.
  1090  	InitialMmapSize int
  1091  
  1092  	// PageSize overrides the default OS page size.
  1093  	PageSize int
  1094  
  1095  	// NoSync sets the initial value of DB.NoSync. Normally this can just be
  1096  	// set directly on the DB itself when returned from Open(), but this option
  1097  	// is useful in APIs which expose Options but not the underlying DB.
  1098  	NoSync bool
  1099  
  1100  	// OpenFile is used to open files. It defaults to os.OpenFile. This option
  1101  	// is useful for writing hermetic tests.
  1102  	OpenFile func(string, int, os.FileMode) (*os.File, error)
  1103  }
  1104  
  1105  // DefaultOptions represent the options used if nil options are passed into Open().
  1106  // No timeout is used which will cause Bolt to wait indefinitely for a lock.
  1107  var DefaultOptions = &Options{
  1108  	Timeout:      0,
  1109  	NoGrowSync:   false,
  1110  	FreelistType: FreelistArrayType,
  1111  }
  1112  
  1113  // Stats represents statistics about the database.
  1114  type Stats struct {
  1115  	// Freelist stats
  1116  	FreePageN     int // total number of free pages on the freelist
  1117  	PendingPageN  int // total number of pending pages on the freelist
  1118  	FreeAlloc     int // total bytes allocated in free pages
  1119  	FreelistInuse int // total bytes used by the freelist
  1120  
  1121  	// Transaction stats
  1122  	TxN     int // total number of started read transactions
  1123  	OpenTxN int // number of currently open read transactions
  1124  
  1125  	TxStats TxStats // global, ongoing stats.
  1126  }
  1127  
  1128  // Sub calculates and returns the difference between two sets of database stats.
  1129  // This is useful when obtaining stats at two different points and time and
  1130  // you need the performance counters that occurred within that time span.
  1131  func (s *Stats) Sub(other *Stats) Stats {
  1132  	if other == nil {
  1133  		return *s
  1134  	}
  1135  	var diff Stats
  1136  	diff.FreePageN = s.FreePageN
  1137  	diff.PendingPageN = s.PendingPageN
  1138  	diff.FreeAlloc = s.FreeAlloc
  1139  	diff.FreelistInuse = s.FreelistInuse
  1140  	diff.TxN = s.TxN - other.TxN
  1141  	diff.TxStats = s.TxStats.Sub(&other.TxStats)
  1142  	return diff
  1143  }
  1144  
  1145  type Info struct {
  1146  	Data     uintptr
  1147  	PageSize int
  1148  }
  1149  
  1150  type meta struct {
  1151  	magic    uint32
  1152  	version  uint32
  1153  	pageSize uint32
  1154  	flags    uint32
  1155  	root     bucket
  1156  	freelist pgid
  1157  	pgid     pgid
  1158  	txid     txid
  1159  	checksum uint64
  1160  }
  1161  
  1162  // validate checks the marker bytes and version of the meta page to ensure it matches this binary.
  1163  func (m *meta) validate() error {
  1164  	if m.magic != magic {
  1165  		return ErrInvalid
  1166  	} else if m.version != version {
  1167  		return ErrVersionMismatch
  1168  	} else if m.checksum != 0 && m.checksum != m.sum64() {
  1169  		return ErrChecksum
  1170  	}
  1171  	return nil
  1172  }
  1173  
  1174  // copy copies one meta object to another.
  1175  func (m *meta) copy(dest *meta) {
  1176  	*dest = *m
  1177  }
  1178  
  1179  // write writes the meta onto a page.
  1180  func (m *meta) write(p *page) {
  1181  	if m.root.root >= m.pgid {
  1182  		panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
  1183  	} else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist {
  1184  		// TODO: reject pgidNoFreeList if !NoFreelistSync
  1185  		panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
  1186  	}
  1187  
  1188  	// Page id is either going to be 0 or 1 which we can determine by the transaction ID.
  1189  	p.id = pgid(m.txid % 2)
  1190  	p.flags |= metaPageFlag
  1191  
  1192  	// Calculate the checksum.
  1193  	m.checksum = m.sum64()
  1194  
  1195  	m.copy(p.meta())
  1196  }
  1197  
  1198  // generates the checksum for the meta.
  1199  func (m *meta) sum64() uint64 {
  1200  	var h = fnv.New64a()
  1201  	_, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
  1202  	return h.Sum64()
  1203  }
  1204  
  1205  // _assert will panic with a given formatted message if the given condition is false.
  1206  func _assert(condition bool, msg string, v ...interface{}) {
  1207  	if !condition {
  1208  		panic(fmt.Sprintf("assertion failed: "+msg, v...))
  1209  	}
  1210  }