github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/transaction.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package badger
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"math"
    23  	"sort"
    24  	"strconv"
    25  	"sync"
    26  	"sync/atomic"
    27  
    28  	"github.com/dgryski/go-farm"
    29  	"github.com/pingcap/badger/epoch"
    30  	"github.com/pingcap/badger/y"
    31  	"github.com/pingcap/errors"
    32  )
    33  
// oracle hands out read and commit timestamps and remembers, per key
// fingerprint, the timestamp of the latest commit that wrote it, which is
// what hasConflict consults at commit time.
type oracle struct {
	// curRead must be at the Top for memory alignment. See issue #311.
	curRead   uint64 // Accessed with sync/atomic by readTs and doneCommit.
	// refCount is adjusted atomically by addRef and decrRef; NewTransaction and
	// Discard call those for update transactions only.
	refCount  int64
	isManaged bool // Does not change value, so no locking required.

	// The embedded Mutex is taken by commitTs, newCommitTs, allocTs and decrRef
	// around their accesses to nextCommit and commits.
	sync.Mutex
	// writeLock is held by Txn.Commit from commit-timestamp allocation until
	// the entries have been handed to the write channel.
	writeLock  sync.Mutex
	// nextCommit is the next timestamp newCommitTs (unmanaged mode) and
	// allocTs will hand out.
	nextCommit uint64

	// commits stores a key fingerprint and latest commit counter for it.
	// refCount is used to clear out commits map to avoid a memory blowup.
	commits map[uint64]uint64
}
    48  
    49  func (o *oracle) addRef() {
    50  	atomic.AddInt64(&o.refCount, 1)
    51  }
    52  
    53  func (o *oracle) decrRef() {
    54  	if count := atomic.AddInt64(&o.refCount, -1); count == 0 {
    55  		// Clear out commits maps to release memory.
    56  		o.Lock()
    57  		// Avoids the race where something new is added to commitsMap
    58  		// after we check refCount and before we take Lock.
    59  		if atomic.LoadInt64(&o.refCount) != 0 {
    60  			o.Unlock()
    61  			return
    62  		}
    63  		if len(o.commits) >= 1000 { // If the map is still small, let it slide.
    64  			o.commits = make(map[uint64]uint64)
    65  		}
    66  		o.Unlock()
    67  	}
    68  }
    69  
    70  func (o *oracle) readTs() uint64 {
    71  	if o.isManaged {
    72  		return math.MaxUint64
    73  	}
    74  	return atomic.LoadUint64(&o.curRead)
    75  }
    76  
    77  func (o *oracle) commitTs() uint64 {
    78  	o.Lock()
    79  	defer o.Unlock()
    80  	return o.nextCommit
    81  }
    82  
    83  // hasConflict must be called while having a lock.
    84  func (o *oracle) hasConflict(txn *Txn) bool {
    85  	if len(txn.reads) == 0 {
    86  		return false
    87  	}
    88  	for _, ro := range txn.reads {
    89  		if ts, has := o.commits[ro]; has && ts > txn.readTs {
    90  			return true
    91  		}
    92  	}
    93  	return false
    94  }
    95  
    96  func (o *oracle) newCommitTs(txn *Txn) uint64 {
    97  	o.Lock()
    98  	defer o.Unlock()
    99  
   100  	if o.hasConflict(txn) {
   101  		return 0
   102  	}
   103  
   104  	var ts uint64
   105  	if !o.isManaged {
   106  		// This is the general case, when user doesn't specify the read and commit ts.
   107  		ts = o.nextCommit
   108  		o.nextCommit++
   109  
   110  	} else {
   111  		// If commitTs is set, use it instead.
   112  		ts = txn.commitTs
   113  	}
   114  
   115  	for _, w := range txn.writes {
   116  		o.commits[w] = ts // Update the commitTs.
   117  	}
   118  	return ts
   119  }
   120  
   121  func (o *oracle) allocTs() uint64 {
   122  	o.Lock()
   123  	ts := o.nextCommit
   124  	o.nextCommit++
   125  	o.Unlock()
   126  	return ts
   127  }
   128  
   129  func (o *oracle) doneCommit(cts uint64) {
   130  	if o.isManaged {
   131  		// No need to update anything.
   132  		return
   133  	}
   134  
   135  	for {
   136  		curRead := atomic.LoadUint64(&o.curRead)
   137  		if cts <= curRead {
   138  			return
   139  		}
   140  		atomic.CompareAndSwapUint64(&o.curRead, curRead, cts)
   141  	}
   142  }
   143  
// Txn represents a Badger transaction. It is created by DB.NewTransaction and
// must be finished with Discard (Commit calls Discard itself).
type Txn struct {
	// readTs is the timestamp used to build lookup keys in Get and MultiGet.
	// commitTs is the timestamp newCommitTs uses when the oracle is managed.
	readTs   uint64
	commitTs uint64

	update bool     // update is used to conditionally keep track of reads.
	reads  []uint64 // contains fingerprints of keys read.
	writes []uint64 // contains fingerprints of keys written.

	// pendingWrites buffers the entries written by this transaction, keyed by
	// user key. Get answers from it before consulting the DB, and Commit sends
	// its contents to the write channel.
	pendingWrites map[string]*Entry // cache stores any writes done by txn.

	// discarded is set by Discard; guard is acquired from db.resourceMgr in
	// NewTransaction and released in Discard.
	db        *DB
	discarded bool
	guard     *epoch.Guard

	// size and count are seeded in NewTransaction to account for the extra
	// txnKey entry and are increased by checkSize for every accepted entry.
	// numIterators is read atomically by Discard, which panics if it is
	// positive. blobCache is cleared by Discard; its other uses are outside
	// this file section.
	size         int64
	count        int64
	numIterators int32
	blobCache    map[uint32]*blobCache
}
   164  
// pendingWritesIterator iterates over a sorted snapshot of a transaction's
// pending writes. It is built by Txn.newPendingWritesIterator.
type pendingWritesIterator struct {
	// entries is sorted ascending by key, or descending when reversed is set.
	entries  []*Entry
	// nextIdx is the index of the current entry; the iterator is valid while
	// nextIdx < len(entries).
	nextIdx  int
	// readTs is reported as the version of every key and value returned.
	readTs   uint64
	// reversed records the sort direction so that Seek can search accordingly.
	reversed bool
}
   171  
   172  func (pi *pendingWritesIterator) Next() {
   173  	pi.nextIdx++
   174  }
   175  
   176  func (pi *pendingWritesIterator) NextVersion() bool {
   177  	// We do not support adding multiple versions in a transaction.
   178  	return false
   179  }
   180  
   181  func (pi *pendingWritesIterator) Rewind() {
   182  	pi.nextIdx = 0
   183  }
   184  
   185  func (pi *pendingWritesIterator) Seek(key []byte) {
   186  	pi.nextIdx = sort.Search(len(pi.entries), func(idx int) bool {
   187  		cmp := bytes.Compare(pi.entries[idx].Key.UserKey, key)
   188  		if !pi.reversed {
   189  			return cmp >= 0
   190  		}
   191  		return cmp <= 0
   192  	})
   193  }
   194  
   195  func (pi *pendingWritesIterator) Key() y.Key {
   196  	y.Assert(pi.Valid())
   197  	entry := pi.entries[pi.nextIdx]
   198  	return y.KeyWithTs(entry.Key.UserKey, pi.readTs)
   199  }
   200  
   201  func (pi *pendingWritesIterator) Value() y.ValueStruct {
   202  	y.Assert(pi.Valid())
   203  	entry := pi.entries[pi.nextIdx]
   204  	return y.ValueStruct{
   205  		Value:    entry.Value,
   206  		Meta:     entry.meta,
   207  		UserMeta: entry.UserMeta,
   208  		Version:  pi.readTs,
   209  	}
   210  }
   211  
   212  func (pi *pendingWritesIterator) FillValue(vs *y.ValueStruct) {
   213  	entry := pi.entries[pi.nextIdx]
   214  	vs.Value = entry.Value
   215  	vs.Meta = entry.meta
   216  	vs.UserMeta = entry.UserMeta
   217  	vs.Version = pi.readTs
   218  }
   219  
   220  func (pi *pendingWritesIterator) Valid() bool {
   221  	return pi.nextIdx < len(pi.entries)
   222  }
   223  
   224  func (pi *pendingWritesIterator) Close() error {
   225  	return nil
   226  }
   227  
   228  func (txn *Txn) newPendingWritesIterator(reversed bool) *pendingWritesIterator {
   229  	if !txn.update || len(txn.pendingWrites) == 0 {
   230  		return nil
   231  	}
   232  	entries := make([]*Entry, 0, len(txn.pendingWrites))
   233  	for _, e := range txn.pendingWrites {
   234  		entries = append(entries, e)
   235  	}
   236  	// Number of pending writes per transaction shouldn't be too big in general.
   237  	sort.Slice(entries, func(i, j int) bool {
   238  		cmp := entries[i].Key.Compare(entries[j].Key)
   239  		if !reversed {
   240  			return cmp < 0
   241  		}
   242  		return cmp > 0
   243  	})
   244  	return &pendingWritesIterator{
   245  		readTs:   txn.readTs,
   246  		entries:  entries,
   247  		reversed: reversed,
   248  	}
   249  }
   250  
   251  func (txn *Txn) checkSize(e *Entry) error {
   252  	if len(e.UserMeta) > 255 {
   253  		return ErrUserMetaTooLarge
   254  	}
   255  	// Extra bytes for version in key.
   256  	size := int64(e.estimateSize()) + 10
   257  	if size >= txn.db.opt.MaxMemTableSize {
   258  		return ErrTxnTooBig
   259  	}
   260  	txn.count++
   261  	txn.size += size
   262  	return nil
   263  }
   264  
   265  // Set adds a key-value pair to the database.
   266  //
   267  // It will return ErrReadOnlyTxn if update flag was set to false when creating the
   268  // transaction.
   269  func (txn *Txn) Set(key, val []byte) error {
   270  	if txn.db.IsManaged() {
   271  		return ErrManagedTxn
   272  	}
   273  	e := &Entry{
   274  		Key:   y.KeyWithTs(key, 0),
   275  		Value: val,
   276  	}
   277  	return txn.SetEntry(e)
   278  }
   279  
   280  // SetWithMeta adds a key-value pair to the database, along with a metadata
   281  // byte. This byte is stored alongside the key, and can be used as an aid to
   282  // interpret the value or store other contextual bits corresponding to the
   283  // key-value pair.
   284  func (txn *Txn) SetWithMeta(key, val []byte, meta byte) error {
   285  	if txn.db.IsManaged() {
   286  		return ErrManagedTxn
   287  	}
   288  	e := &Entry{Key: y.KeyWithTs(key, 0), Value: val, UserMeta: []byte{meta}}
   289  	return txn.SetEntry(e)
   290  }
   291  
   292  func (txn *Txn) SetWithMetaSlice(key, val, meta []byte) error {
   293  	if txn.db.IsManaged() {
   294  		return ErrManagedTxn
   295  	}
   296  	e := &Entry{Key: y.KeyWithTs(key, 0), Value: val, UserMeta: meta}
   297  	return txn.SetEntry(e)
   298  }
   299  
   300  func (txn *Txn) modify(e *Entry) error {
   301  	if !txn.update {
   302  		return ErrReadOnlyTxn
   303  	} else if txn.discarded {
   304  		return ErrDiscardedTxn
   305  	} else if e.Key.IsEmpty() {
   306  		return ErrEmptyKey
   307  	} else if e.Key.Len() > maxKeySize {
   308  		return exceedsMaxKeySizeError(e.Key.UserKey)
   309  	} else if int64(len(e.Value)) > txn.db.opt.ValueLogFileSize {
   310  		return exceedsMaxValueSizeError(e.Value, txn.db.opt.ValueLogFileSize)
   311  	}
   312  	if err := txn.checkSize(e); err != nil {
   313  		return err
   314  	}
   315  
   316  	fp := farm.Fingerprint64(e.Key.UserKey) // Avoid dealing with byte arrays.
   317  	txn.writes = append(txn.writes, fp)
   318  	txn.pendingWrites[string(e.Key.UserKey)] = e
   319  	return nil
   320  }
   321  
   322  // SetEntry takes an Entry struct and adds the key-value pair in the struct, along
   323  // with other metadata to the database.
   324  func (txn *Txn) SetEntry(e *Entry) error {
   325  	return txn.modify(e)
   326  }
   327  
   328  // Delete deletes a key. This is done by adding a delete marker for the key at commit timestamp.
   329  // Any reads happening before this timestamp would be unaffected. Any reads after this commit would
   330  // see the deletion.
   331  func (txn *Txn) Delete(key []byte) error {
   332  	e := &Entry{
   333  		Key:  y.KeyWithTs(key, 0),
   334  		meta: bitDelete,
   335  	}
   336  	return txn.modify(e)
   337  }
   338  
   339  // Get looks for key and returns corresponding Item.
   340  // If key is not found, ErrKeyNotFound is returned.
   341  func (txn *Txn) Get(key []byte) (item *Item, rerr error) {
   342  	if len(key) == 0 {
   343  		return nil, ErrEmptyKey
   344  	} else if txn.discarded {
   345  		return nil, ErrDiscardedTxn
   346  	}
   347  
   348  	item = new(Item)
   349  	if txn.update {
   350  		if e, has := txn.pendingWrites[string(key)]; has && bytes.Equal(key, e.Key.UserKey) {
   351  			if isDeleted(e.meta) {
   352  				return nil, ErrKeyNotFound
   353  			}
   354  			// Fulfill from cache.
   355  			item.meta = e.meta
   356  			item.vptr = e.Value
   357  			item.userMeta = e.UserMeta
   358  			item.key.UserKey = key
   359  			item.key.Version = txn.readTs
   360  			// We probably don't need to set db on item here.
   361  			return item, nil
   362  		}
   363  		// Only track reads if this is update txn. No need to track read if txn serviced it
   364  		// internally.
   365  		fp := farm.Fingerprint64(key)
   366  		txn.reads = append(txn.reads, fp)
   367  	}
   368  
   369  	seek := y.KeyWithTs(key, txn.readTs)
   370  	var vs y.ValueStruct
   371  	for {
   372  		vs = txn.db.get(seek)
   373  		if !vs.Valid() {
   374  			return nil, ErrKeyNotFound
   375  		}
   376  		if isDeleted(vs.Meta) {
   377  			return nil, ErrKeyNotFound
   378  		}
   379  		break
   380  	}
   381  
   382  	item.key.UserKey = key
   383  	item.key.Version = vs.Version
   384  	item.meta = vs.Meta
   385  	item.userMeta = vs.UserMeta
   386  	item.db = txn.db
   387  	item.vptr = vs.Value
   388  	item.txn = txn
   389  	return item, nil
   390  }
   391  
// keyValuePair is one lookup slot used by Txn.MultiGet: the caller fills in
// key and hash, and db.multiGet is expected to fill in val and found.
type keyValuePair struct {
	// key is the user key stamped with the transaction's read timestamp.
	key   y.Key
	// hash is the farm fingerprint of the user key.
	hash  uint64
	// val is the value looked up for key; only meaningful when found is true.
	val   y.ValueStruct
	// found reports whether the lookup produced a value.
	found bool
}
   398  
   399  // MultiGet gets items for keys, if not found, the corresponding item will be nil.
   400  // It only supports read-only transaction for simplicity.
   401  func (txn *Txn) MultiGet(keys [][]byte) (items []*Item, err error) {
   402  	if txn.update {
   403  		return nil, errors.New("not supported")
   404  	}
   405  	if txn.discarded {
   406  		return nil, ErrDiscardedTxn
   407  	}
   408  	keyValuePairs := make([]keyValuePair, len(keys))
   409  	for i, key := range keys {
   410  		if len(key) == 0 {
   411  			return nil, ErrEmptyKey
   412  		}
   413  		keyValuePairs[i].hash = farm.Fingerprint64(key)
   414  		keyValuePairs[i].key = y.KeyWithTs(key, txn.readTs)
   415  	}
   416  	txn.db.multiGet(keyValuePairs)
   417  	items = make([]*Item, len(keys))
   418  	for i, pair := range keyValuePairs {
   419  		if pair.found && !isDeleted(pair.val.Meta) {
   420  			items[i] = &Item{
   421  				key: y.Key{
   422  					UserKey: keys[i],
   423  					Version: pair.val.Version,
   424  				},
   425  				meta:     pair.val.Meta,
   426  				userMeta: pair.val.UserMeta,
   427  				db:       txn.db,
   428  				vptr:     pair.val.Value,
   429  				txn:      txn,
   430  			}
   431  		}
   432  	}
   433  	return items, nil
   434  }
   435  
   436  // Discard discards a created transaction. This method is very important and must be called. Commit
   437  // method calls this internally, however, calling this multiple times doesn't cause any issues. So,
   438  // this can safely be called via a defer right when transaction is created.
   439  //
   440  // NOTE: If any operations are run on a discarded transaction, ErrDiscardedTxn is returned.
   441  func (txn *Txn) Discard() {
   442  	if txn.discarded { // Avoid a re-run.
   443  		return
   444  	}
   445  	if atomic.LoadInt32(&txn.numIterators) > 0 {
   446  		panic("Unclosed iterator at time of Txn.Discard.")
   447  	}
   448  	txn.discarded = true
   449  	txn.blobCache = nil
   450  	if txn.update {
   451  		txn.db.orc.decrRef()
   452  	}
   453  	txn.guard.Done()
   454  }
   455  
// Commit commits the transaction, following these steps:
//
// 1. If there are no writes, return immediately.
//
// 2. Check if read rows were updated since txn started. If so, return ErrConflict.
// (This check and step 3 are skipped when the DB is managed; every entry must
// then already carry a non-zero version.)
//
// 3. If no conflict, generate a commit timestamp and update written rows' commit ts.
//
// 4. Batch up all writes and hand them to the DB's write channel.
//
// Commit returns ErrDiscardedTxn if the transaction was already discarded, and
// always discards the transaction before returning otherwise.
//
// If error is nil, the transaction is successfully committed. In case of a non-nil error, the LSM
// tree won't be updated, so there's no need for any rollback.
func (txn *Txn) Commit() error {
	if txn.discarded {
		return ErrDiscardedTxn
	}
	defer txn.Discard()
	if len(txn.writes) == 0 {
		return nil // Nothing to do.
	}
	managed := txn.db.IsManaged()
	// One extra slot for the txnKey boundary entry appended below.
	entries := make([]*Entry, 0, len(txn.pendingWrites)+1)
	for _, e := range txn.pendingWrites {
		if managed && e.Key.Version == 0 {
			return fmt.Errorf("version of key %x not specified for managed db", e.Key.UserKey)
		}
		e.meta |= bitTxn
		entries = append(entries, e)
	}
	// Map iteration order is random; sort so entries are written in key order.
	sort.Slice(entries, func(i, j int) bool {
		return entries[i].Key.Compare(entries[j].Key) < 0
	})
	var commitTs uint64
	state := txn.db.orc
	// writeLock is held from timestamp allocation until the entries have been
	// handed to the write channel, and is released on every path below.
	state.writeLock.Lock()
	if !managed {
		commitTs = state.newCommitTs(txn)
		if commitTs == 0 {
			// newCommitTs returns 0 when a read key was modified concurrently.
			state.writeLock.Unlock()
			return ErrConflict
		}
		for _, e := range entries {
			// Suffix the keys with commit ts, so the key versions are sorted in
			// descending order of commit timestamp.
			e.Key.Version = commitTs
		}
	}
	// The txnKey entry is used to mark the transaction boundary; the value here is used for assertion.
	// In managed mode commitTs is still 0 at this point.
	e := &Entry{
		Key:   y.KeyWithTs(txnKey, commitTs),
		Value: []byte(strconv.FormatUint(commitTs, 10)),
		meta:  bitFinTxn,
	}
	entries = append(entries, e)

	req, err := txn.db.sendToWriteCh(entries)
	state.writeLock.Unlock()
	if err != nil {
		return err
	}

	// NOTE(review): any result of req.Wait() is not inspected here — confirm
	// whether write failures should be propagated to the caller.
	req.Wait()
	// Advance the oracle's read timestamp only after the write has been waited for.
	state.doneCommit(commitTs)

	return nil
}
   522  
   523  // NewTransaction creates a new transaction. Badger supports concurrent execution of transactions,
   524  // providing serializable snapshot isolation, avoiding write skews. Badger achieves this by tracking
   525  // the keys read and at Commit time, ensuring that these read keys weren't concurrently modified by
   526  // another transaction.
   527  //
   528  // For read-only transactions, set update to false. In this mode, we don't track the rows read for
   529  // any changes. Thus, any long running iterations done in this mode wouldn't pay this overhead.
   530  //
   531  // Running transactions concurrently is OK. However, a transaction itself isn't thread safe, and
   532  // should only be run serially. It doesn't matter if a transaction is created by one goroutine and
   533  // passed down to other, as long as the Txn APIs are called serially.
   534  //
   535  // When you create a new transaction, it is absolutely essential to call
   536  // Discard(). This should be done irrespective of what the update param is set
   537  // to. Commit API internally runs Discard, but running it twice wouldn't cause
   538  // any issues.
   539  //
   540  //  txn := db.NewTransaction(false)
   541  //  defer txn.Discard()
   542  //  // Call various APIs.
   543  func (db *DB) NewTransaction(update bool) *Txn {
   544  	if db.opt.ReadOnly {
   545  		// DB is read-only, force read-only transaction.
   546  		update = false
   547  	}
   548  	readTs := db.orc.readTs()
   549  	txn := &Txn{
   550  		update: update,
   551  		db:     db,
   552  		count:  1,                       // One extra entry for BitFin.
   553  		size:   int64(len(txnKey) + 10), // Some buffer for the extra entry.
   554  		readTs: readTs,
   555  	}
   556  	if !db.IsManaged() {
   557  		txn.guard = db.resourceMgr.AcquireWithPayload(readTs)
   558  	} else {
   559  		txn.guard = db.resourceMgr.Acquire()
   560  	}
   561  	if update {
   562  		txn.pendingWrites = make(map[string]*Entry)
   563  		txn.db.orc.addRef()
   564  	}
   565  	return txn
   566  }
   567  
   568  // View executes a function creating and managing a read-only transaction for the user. Error
   569  // returned by the function is relayed by the View method.
   570  func (db *DB) View(fn func(txn *Txn) error) error {
   571  	txn := db.NewTransaction(false)
   572  	if db.IsManaged() {
   573  		txn.SetReadTS(math.MaxUint64)
   574  	}
   575  	defer txn.Discard()
   576  
   577  	return fn(txn)
   578  }
   579  
   580  // SetReadTS reads the DB with a given TS, it can only be used in a managed DB.
   581  func (txn *Txn) SetReadTS(readTS uint64) {
   582  	y.Assert(txn.db.IsManaged())
   583  	txn.readTs = readTS
   584  }
   585  
   586  // Update executes a function, creating and managing a read-write transaction
   587  // for the user. Error returned by the function is relayed by the Update method.
   588  func (db *DB) Update(fn func(txn *Txn) error) error {
   589  	txn := db.NewTransaction(true)
   590  	defer txn.Discard()
   591  
   592  	if err := fn(txn); err != nil {
   593  		return err
   594  	}
   595  
   596  	return txn.Commit()
   597  }