github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/bitree/bdb/tx.go (about)

     1  // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package bdb
    16  
    17  import (
    18  	"io"
    19  	"os"
    20  	"sort"
    21  	"strings"
    22  	"sync/atomic"
    23  	"time"
    24  	"unsafe"
    25  
    26  	"github.com/cockroachdb/errors"
    27  )
    28  
// txid is the numeric identifier of a transaction. It is stored in the meta
// record (tx.meta.txid) and is bumped by one when a writable Tx is initialised.
type txid uint64
    30  
    31  type ReadTx struct {
    32  	ref atomic.Int32
    33  	tx  *Tx
    34  	bkt *Bucket
    35  	bdb *DB
    36  }
    37  
    38  func (rt *ReadTx) Init(tx *Tx, bkt *Bucket, bdb *DB) {
    39  	rt.ref.Store(1)
    40  	rt.tx = tx
    41  	rt.bkt = bkt
    42  	rt.bdb = bdb
    43  }
    44  
    45  func (rt *ReadTx) Bucket() *Bucket {
    46  	return rt.bkt
    47  }
    48  
    49  func (rt *ReadTx) Ref() {
    50  	rt.ref.Add(1)
    51  }
    52  
    53  func (rt *ReadTx) Unref(update bool) (err error) {
    54  	if rt.ref.Add(-1) == 0 {
    55  		err = rt.tx.Rollback()
    56  		if update {
    57  			err = rt.bdb.Update(func(tx *Tx) error { return nil })
    58  		}
    59  	}
    60  	return err
    61  }
    62  
// Tx is a single transaction on a DB. Read-only transactions work on a private
// copy of the DB meta; writable transactions additionally track the pages they
// allocate and write them out on Commit.
type Tx struct {
	writable       bool           // true for a read-write tx; Commit returns ErrTxNotWritable otherwise
	managed        bool           // when set, Commit and Rollback trip an assertion
	db             *DB            // owning database; set to nil by close, which marks the tx as closed
	meta           *meta          // private copy of the DB meta taken in init
	root           Bucket         // root bucket, initialised from meta.root
	pages          map[pgid]*page // pages allocated by this tx, keyed by id; only created for writable txs
	stats          TxStats        // per-transaction counters, merged into the DB stats when a writable tx closes
	commitHandlers []func()       // callbacks registered via OnCommit, run after a successful Commit
	WriteFlag      int            // extra flags OR'ed into the open flags used by WriteTo
}
    74  
    75  func (tx *Tx) init(db *DB) {
    76  	tx.db = db
    77  	tx.pages = nil
    78  
    79  	tx.meta = &meta{}
    80  	db.meta().copy(tx.meta)
    81  
    82  	tx.root = newBucket(tx)
    83  	tx.root.bucket = &bucket{}
    84  	*tx.root.bucket = tx.meta.root
    85  
    86  	if tx.writable {
    87  		tx.pages = make(map[pgid]*page, 1<<4)
    88  		tx.meta.txid += txid(1)
    89  	}
    90  }
    91  
    92  func (tx *Tx) ID() int {
    93  	return int(tx.meta.txid)
    94  }
    95  
    96  func (tx *Tx) DB() *DB {
    97  	return tx.db
    98  }
    99  
   100  func (tx *Tx) Size() int64 {
   101  	return int64(tx.meta.pgid) * int64(tx.db.pageSize)
   102  }
   103  
   104  func (tx *Tx) Writable() bool {
   105  	return tx.writable
   106  }
   107  
   108  func (tx *Tx) Cursor() *Cursor {
   109  	return tx.root.Cursor()
   110  }
   111  
   112  func (tx *Tx) Stats() TxStats {
   113  	return tx.stats
   114  }
   115  
   116  func (tx *Tx) Bucket(name []byte) *Bucket {
   117  	return tx.root.Bucket(name)
   118  }
   119  
   120  func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) {
   121  	return tx.root.CreateBucket(name)
   122  }
   123  
   124  func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) {
   125  	return tx.root.CreateBucketIfNotExists(name)
   126  }
   127  
   128  func (tx *Tx) DeleteBucket(name []byte) error {
   129  	return tx.root.DeleteBucket(name)
   130  }
   131  
   132  func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error {
   133  	return tx.root.ForEach(func(k, v []byte) error {
   134  		return fn(k, tx.root.Bucket(k))
   135  	})
   136  }
   137  
   138  func (tx *Tx) OnCommit(fn func()) {
   139  	tx.commitHandlers = append(tx.commitHandlers, fn)
   140  }
   141  
   142  func (tx *Tx) Commit() error {
   143  	_assert(!tx.managed, "managed tx commit not allowed")
   144  	if tx.db == nil {
   145  		return ErrTxClosed
   146  	} else if !tx.writable {
   147  		return ErrTxNotWritable
   148  	}
   149  
   150  	var startTime = time.Now()
   151  	tx.root.rebalance()
   152  	if tx.stats.Rebalance > 0 {
   153  		tx.stats.RebalanceTime += time.Since(startTime)
   154  	}
   155  
   156  	startTime = time.Now()
   157  	if err := tx.root.spill(); err != nil {
   158  		tx.rollback()
   159  		return err
   160  	}
   161  	tx.stats.SpillTime += time.Since(startTime)
   162  
   163  	tx.meta.root.root = tx.root.root
   164  
   165  	if tx.meta.freelist != pgidNoFreelist {
   166  		tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist))
   167  	}
   168  
   169  	if !tx.db.NoFreelistSync {
   170  		if err := tx.commitFreelist(); err != nil {
   171  			return err
   172  		}
   173  	} else {
   174  		tx.meta.freelist = pgidNoFreelist
   175  	}
   176  
   177  	startTime = time.Now()
   178  	if err := tx.write(); err != nil {
   179  		tx.rollback()
   180  		return err
   181  	}
   182  
   183  	if tx.db.StrictMode {
   184  		ch := tx.Check()
   185  		var errs []string
   186  		for {
   187  			err, ok := <-ch
   188  			if !ok {
   189  				break
   190  			}
   191  			errs = append(errs, err.Error())
   192  		}
   193  		if len(errs) > 0 {
   194  			panic("check fail: " + strings.Join(errs, "\n"))
   195  		}
   196  	}
   197  
   198  	if err := tx.writeMeta(); err != nil {
   199  		tx.rollback()
   200  		return err
   201  	}
   202  	tx.stats.WriteTime += time.Since(startTime)
   203  
   204  	tx.close()
   205  
   206  	for _, fn := range tx.commitHandlers {
   207  		fn()
   208  	}
   209  
   210  	return nil
   211  }
   212  
   213  func (tx *Tx) commitFreelist() error {
   214  	opgid := tx.meta.pgid
   215  	rb, freeCount := tx.db.freelist.convertBitmap()
   216  	rbLen := rb.GetSerializedSizeInBytes()
   217  	count := ((int(rbLen) + freelistBitmapHeaderSize) / tx.db.pageSize) + 2
   218  	p, isFreePage, err := tx.allocate(count)
   219  	if err != nil {
   220  		tx.rollback()
   221  		return err
   222  	}
   223  
   224  	if isFreePage {
   225  		for pid := p.id; pid < p.id+pgid(count); pid++ {
   226  			if rb.Contains(uint64(pid)) {
   227  				rb.Remove(uint64(pid))
   228  				freeCount--
   229  			}
   230  		}
   231  		rbLen = rb.GetSerializedSizeInBytes()
   232  	}
   233  
   234  	if freeCount < 0 {
   235  		freeCount = 0
   236  	}
   237  
   238  	if err := tx.db.freelist.writeBitmap(p, rb, rbLen, freeCount); err != nil {
   239  		tx.rollback()
   240  		return err
   241  	}
   242  
   243  	tx.meta.freelist = p.id
   244  	if tx.meta.pgid > opgid {
   245  		if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil {
   246  			tx.rollback()
   247  			return err
   248  		}
   249  	}
   250  
   251  	return nil
   252  }
   253  
   254  func (tx *Tx) Rollback() error {
   255  	_assert(!tx.managed, "managed tx rollback not allowed")
   256  	if tx.db == nil {
   257  		return ErrTxClosed
   258  	}
   259  	tx.nonPhysicalRollback()
   260  	return nil
   261  }
   262  
   263  func (tx *Tx) nonPhysicalRollback() {
   264  	if tx.db == nil {
   265  		return
   266  	}
   267  	if tx.writable {
   268  		tx.db.freelist.rollback(tx.meta.txid)
   269  	}
   270  	tx.close()
   271  }
   272  
   273  func (tx *Tx) rollback() {
   274  	if tx.db == nil {
   275  		return
   276  	}
   277  	if tx.writable {
   278  		tx.db.freelist.rollback(tx.meta.txid)
   279  		if !tx.db.hasSyncedFreelist() {
   280  			tx.db.freelist.noSyncReload(tx.db.freepages())
   281  		} else {
   282  			tx.db.freelist.reload(tx.db.page(tx.db.meta().freelist), tx.db.meta().version)
   283  		}
   284  	}
   285  	tx.close()
   286  }
   287  
   288  func (tx *Tx) close() {
   289  	if tx.db == nil {
   290  		return
   291  	}
   292  	if tx.writable {
   293  		var freelistFreeN = tx.db.freelist.free_count()
   294  		var freelistPendingN = tx.db.freelist.pending_count()
   295  		var freelistAlloc = tx.db.freelist.size()
   296  
   297  		tx.db.rwtx = nil
   298  		tx.db.rwlock.Unlock()
   299  
   300  		tx.db.statlock.Lock()
   301  		tx.db.stats.FreePageN = freelistFreeN
   302  		tx.db.stats.PendingPageN = freelistPendingN
   303  		tx.db.stats.FreeAlloc = (freelistFreeN + freelistPendingN) * tx.db.pageSize
   304  		tx.db.stats.FreelistInuse = freelistAlloc
   305  		tx.db.stats.TxStats.add(&tx.stats)
   306  		tx.db.statlock.Unlock()
   307  	} else {
   308  		tx.db.removeTx(tx)
   309  	}
   310  
   311  	tx.db = nil
   312  	tx.meta = nil
   313  	tx.root = Bucket{tx: tx}
   314  	tx.pages = nil
   315  }
   316  
   317  func (tx *Tx) Copy(w io.Writer) error {
   318  	_, err := tx.WriteTo(w)
   319  	return err
   320  }
   321  
   322  func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
   323  	f, err := tx.db.openFile(tx.db.path, os.O_RDONLY|tx.WriteFlag, 0)
   324  	if err != nil {
   325  		return 0, err
   326  	}
   327  	defer func() {
   328  		if cerr := f.Close(); err == nil {
   329  			err = cerr
   330  		}
   331  	}()
   332  
   333  	buf := make([]byte, tx.db.pageSize)
   334  	page := (*page)(unsafe.Pointer(&buf[0]))
   335  	page.flags = metaPageFlag
   336  	*page.meta() = *tx.meta
   337  
   338  	page.id = 0
   339  	page.meta().checksum = page.meta().sum64()
   340  	nn, err := w.Write(buf)
   341  	n += int64(nn)
   342  	if err != nil {
   343  		return n, errors.Wrap(err, "meta 0 copy err")
   344  	}
   345  
   346  	page.id = 1
   347  	page.meta().txid -= 1
   348  	page.meta().checksum = page.meta().sum64()
   349  	nn, err = w.Write(buf)
   350  	n += int64(nn)
   351  	if err != nil {
   352  		return n, errors.Wrap(err, "meta 1 copy err")
   353  	}
   354  
   355  	if _, err := f.Seek(int64(tx.db.pageSize*2), io.SeekStart); err != nil {
   356  		return n, errors.Wrap(err, "seek err")
   357  	}
   358  
   359  	wn, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2))
   360  	n += wn
   361  	if err != nil {
   362  		return n, err
   363  	}
   364  
   365  	return n, nil
   366  }
   367  
   368  func (tx *Tx) CopyFile(path string, mode os.FileMode) error {
   369  	f, err := tx.db.openFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode)
   370  	if err != nil {
   371  		return err
   372  	}
   373  
   374  	_, err = tx.WriteTo(f)
   375  	if err != nil {
   376  		_ = f.Close()
   377  		return err
   378  	}
   379  	return f.Close()
   380  }
   381  
   382  func (tx *Tx) Check() <-chan error {
   383  	ch := make(chan error)
   384  	go tx.check(ch)
   385  	return ch
   386  }
   387  
   388  func (tx *Tx) check(ch chan error) {
   389  	tx.db.loadFreelist()
   390  
   391  	freed := make(map[pgid]bool, 1<<4)
   392  	all := make([]pgid, tx.db.freelist.count())
   393  	tx.db.freelist.copyall(all)
   394  	for _, id := range all {
   395  		if freed[id] {
   396  			ch <- errors.Errorf("page %d: already freed", id)
   397  		}
   398  		freed[id] = true
   399  	}
   400  
   401  	reachable := make(map[pgid]*page, 1<<4)
   402  	reachable[0] = tx.page(0)
   403  	reachable[1] = tx.page(1)
   404  	if tx.meta.freelist != pgidNoFreelist {
   405  		for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ {
   406  			reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist)
   407  		}
   408  	}
   409  
   410  	tx.checkBucket(&tx.root, reachable, freed, ch)
   411  
   412  	for i := pgid(0); i < tx.meta.pgid; i++ {
   413  		_, isReachable := reachable[i]
   414  		if !isReachable && !freed[i] {
   415  			ch <- errors.Errorf("page %d: unreachable unfreed", int(i))
   416  		}
   417  	}
   418  
   419  	close(ch)
   420  }
   421  
   422  func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bool, ch chan error) {
   423  	if b.root == 0 {
   424  		return
   425  	}
   426  
   427  	b.tx.forEachPage(b.root, 0, func(p *page, _ int) {
   428  		if p.id > tx.meta.pgid {
   429  			ch <- errors.Errorf("page %d: out of bounds: %d", int(p.id), int(b.tx.meta.pgid))
   430  		}
   431  
   432  		for i := pgid(0); i <= pgid(p.overflow); i++ {
   433  			var id = p.id + i
   434  			if _, ok := reachable[id]; ok {
   435  				ch <- errors.Errorf("page %d: multiple references", int(id))
   436  			}
   437  			reachable[id] = p
   438  		}
   439  
   440  		if freed[p.id] {
   441  			ch <- errors.Errorf("page %d: reachable freed", int(p.id))
   442  		} else if (p.flags&branchPageFlag) == 0 && (p.flags&leafPageFlag) == 0 {
   443  			ch <- errors.Errorf("page %d: invalid type: %s", int(p.id), p.typ())
   444  		}
   445  	})
   446  
   447  	_ = b.ForEach(func(k, v []byte) error {
   448  		if child := b.Bucket(k); child != nil {
   449  			tx.checkBucket(child, reachable, freed, ch)
   450  		}
   451  		return nil
   452  	})
   453  }
   454  
   455  func (tx *Tx) allocate(count int) (*page, bool, error) {
   456  	p, isFreePage, err := tx.db.allocate(tx.meta.txid, count)
   457  	if err != nil {
   458  		return nil, isFreePage, err
   459  	}
   460  
   461  	tx.pages[p.id] = p
   462  
   463  	tx.stats.PageCount += count
   464  	tx.stats.PageAlloc += count * tx.db.pageSize
   465  
   466  	return p, isFreePage, nil
   467  }
   468  
   469  func (tx *Tx) write() error {
   470  	pages := make(pages, 0, len(tx.pages))
   471  	for _, p := range tx.pages {
   472  		pages = append(pages, p)
   473  	}
   474  
   475  	tx.pages = make(map[pgid]*page, 1<<4)
   476  	sort.Sort(pages)
   477  
   478  	for _, p := range pages {
   479  		rem := (uint64(p.overflow) + 1) * uint64(tx.db.pageSize)
   480  		offset := int64(p.id) * int64(tx.db.pageSize)
   481  		var written uintptr
   482  
   483  		for {
   484  			sz := rem
   485  			if sz > maxAllocSize-1 {
   486  				sz = maxAllocSize - 1
   487  			}
   488  			buf := unsafeByteSlice(unsafe.Pointer(p), written, 0, int(sz))
   489  
   490  			if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
   491  				return err
   492  			}
   493  
   494  			tx.stats.Write++
   495  
   496  			rem -= sz
   497  			if rem == 0 {
   498  				break
   499  			}
   500  
   501  			offset += int64(sz)
   502  			written += uintptr(sz)
   503  		}
   504  	}
   505  
   506  	if !tx.db.NoSync || IgnoreNoSync {
   507  		if err := fdatasync(tx.db); err != nil {
   508  			return err
   509  		}
   510  	}
   511  
   512  	for _, p := range pages {
   513  		if int(p.overflow) != 0 {
   514  			continue
   515  		}
   516  
   517  		buf := unsafeByteSlice(unsafe.Pointer(p), 0, 0, tx.db.pageSize)
   518  
   519  		for i := range buf {
   520  			buf[i] = 0
   521  		}
   522  		tx.db.pagePool.Put(buf)
   523  	}
   524  
   525  	return nil
   526  }
   527  
   528  func (tx *Tx) writeMeta() (err error) {
   529  	buf := make([]byte, tx.db.pageSize)
   530  	p := tx.db.pageInBuffer(buf, 0)
   531  
   532  	metaOldVersion := tx.meta.version
   533  	if tx.db.version == versionFreelistBitmap && metaOldVersion != versionFreelistBitmap {
   534  		tx.meta.version = versionFreelistBitmap
   535  	}
   536  
   537  	tx.meta.write(p)
   538  
   539  	defer func() {
   540  		if err != nil && tx.meta.version != metaOldVersion {
   541  			tx.meta.version = metaOldVersion
   542  		}
   543  	}()
   544  
   545  	if _, err = tx.db.ops.writeAt(buf, int64(p.id)*int64(tx.db.pageSize)); err != nil {
   546  
   547  		return err
   548  	}
   549  	if !tx.db.NoSync || IgnoreNoSync {
   550  		if err = fdatasync(tx.db); err != nil {
   551  			tx.meta.version = metaOldVersion
   552  			return err
   553  		}
   554  	}
   555  
   556  	tx.stats.Write++
   557  
   558  	return nil
   559  }
   560  
   561  func (tx *Tx) page(id pgid) *page {
   562  	if tx.pages != nil {
   563  		if p, ok := tx.pages[id]; ok {
   564  			return p
   565  		}
   566  	}
   567  
   568  	return tx.db.page(id)
   569  }
   570  
   571  func (tx *Tx) forEachPage(pgid pgid, depth int, fn func(*page, int)) {
   572  	p := tx.page(pgid)
   573  
   574  	fn(p, depth)
   575  
   576  	if (p.flags & branchPageFlag) != 0 {
   577  		for i := 0; i < int(p.count); i++ {
   578  			elem := p.branchPageElement(uint16(i))
   579  			tx.forEachPage(elem.pgid, depth+1, fn)
   580  		}
   581  	}
   582  }
   583  
   584  func (tx *Tx) Page(id int) (*PageInfo, error) {
   585  	if tx.db == nil {
   586  		return nil, ErrTxClosed
   587  	} else if pgid(id) >= tx.meta.pgid {
   588  		return nil, nil
   589  	}
   590  
   591  	p := tx.db.page(pgid(id))
   592  	info := &PageInfo{
   593  		ID:            id,
   594  		Count:         int(p.count),
   595  		OverflowCount: int(p.overflow),
   596  	}
   597  
   598  	if tx.db.freelist.freed(pgid(id)) {
   599  		info.Type = "free"
   600  	} else {
   601  		info.Type = p.typ()
   602  	}
   603  
   604  	return info, nil
   605  }
   606  
   607  type TxStats struct {
   608  	PageCount     int
   609  	PageAlloc     int
   610  	CursorCount   int
   611  	NodeCount     int
   612  	NodeDeref     int
   613  	Rebalance     int
   614  	RebalanceTime time.Duration
   615  	Split         int
   616  	Spill         int
   617  	SpillTime     time.Duration
   618  	Write         int
   619  	WriteTime     time.Duration
   620  }
   621  
   622  func (s *TxStats) add(other *TxStats) {
   623  	s.PageCount += other.PageCount
   624  	s.PageAlloc += other.PageAlloc
   625  	s.CursorCount += other.CursorCount
   626  	s.NodeCount += other.NodeCount
   627  	s.NodeDeref += other.NodeDeref
   628  	s.Rebalance += other.Rebalance
   629  	s.RebalanceTime += other.RebalanceTime
   630  	s.Split += other.Split
   631  	s.Spill += other.Spill
   632  	s.SpillTime += other.SpillTime
   633  	s.Write += other.Write
   634  	s.WriteTime += other.WriteTime
   635  }
   636  
   637  func (s *TxStats) Sub(other *TxStats) TxStats {
   638  	var diff TxStats
   639  	diff.PageCount = s.PageCount - other.PageCount
   640  	diff.PageAlloc = s.PageAlloc - other.PageAlloc
   641  	diff.CursorCount = s.CursorCount - other.CursorCount
   642  	diff.NodeCount = s.NodeCount - other.NodeCount
   643  	diff.NodeDeref = s.NodeDeref - other.NodeDeref
   644  	diff.Rebalance = s.Rebalance - other.Rebalance
   645  	diff.RebalanceTime = s.RebalanceTime - other.RebalanceTime
   646  	diff.Split = s.Split - other.Split
   647  	diff.Spill = s.Spill - other.Spill
   648  	diff.SpillTime = s.SpillTime - other.SpillTime
   649  	diff.Write = s.Write - other.Write
   650  	diff.WriteTime = s.WriteTime - other.WriteTime
   651  	return diff
   652  }