github.com/coocood/badger@v1.5.1-0.20200528065104-c02ac3616d04/transaction.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package badger
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"math"
    23  	"sort"
    24  	"strconv"
    25  	"sync"
    26  	"sync/atomic"
    27  
    28  	"github.com/coocood/badger/epoch"
    29  	"github.com/coocood/badger/y"
    30  	"github.com/dgryski/go-farm"
    31  	"github.com/pingcap/errors"
    32  )
    33  
// oracle hands out read and commit timestamps and detects conflicts between
// update transactions.
type oracle struct {
	// curRead must be at the top for memory alignment. See issue #311.
	curRead   uint64 // Read and advanced with sync/atomic (see readTs and doneCommit).
	refCount  int64  // Number of live update transactions; modified with sync/atomic.
	isManaged bool   // Does not change value, so no locking required.

	// The embedded mutex guards nextCommit and commits.
	sync.Mutex
	// writeLock is held by Txn.Commit from commit-timestamp allocation until
	// the entries have been handed to the write channel.
	writeLock  sync.Mutex
	nextCommit uint64

	// commits stores a key fingerprint and latest commit counter for it.
	// refCount is used to clear out commits map to avoid a memory blowup.
	commits map[uint64]uint64
}
    48  
// addRef atomically increments the oracle's reference count. NewTransaction
// calls it once for every update transaction it creates.
func (o *oracle) addRef() {
	atomic.AddInt64(&o.refCount, 1)
}
    52  
    53  func (o *oracle) decrRef() {
    54  	if count := atomic.AddInt64(&o.refCount, -1); count == 0 {
    55  		// Clear out commits maps to release memory.
    56  		o.Lock()
    57  		// Avoids the race where something new is added to commitsMap
    58  		// after we check refCount and before we take Lock.
    59  		if atomic.LoadInt64(&o.refCount) != 0 {
    60  			o.Unlock()
    61  			return
    62  		}
    63  		if len(o.commits) >= 1000 { // If the map is still small, let it slide.
    64  			o.commits = make(map[uint64]uint64)
    65  		}
    66  		o.Unlock()
    67  	}
    68  }
    69  
    70  func (o *oracle) readTs() uint64 {
    71  	if o.isManaged {
    72  		return math.MaxUint64
    73  	}
    74  	return atomic.LoadUint64(&o.curRead)
    75  }
    76  
    77  func (o *oracle) commitTs() uint64 {
    78  	o.Lock()
    79  	defer o.Unlock()
    80  	return o.nextCommit
    81  }
    82  
    83  // hasConflict must be called while having a lock.
    84  func (o *oracle) hasConflict(txn *Txn) bool {
    85  	if len(txn.reads) == 0 {
    86  		return false
    87  	}
    88  	for _, ro := range txn.reads {
    89  		if ts, has := o.commits[ro]; has && ts > txn.readTs {
    90  			return true
    91  		}
    92  	}
    93  	return false
    94  }
    95  
    96  func (o *oracle) newCommitTs(txn *Txn) uint64 {
    97  	o.Lock()
    98  	defer o.Unlock()
    99  
   100  	if o.hasConflict(txn) {
   101  		return 0
   102  	}
   103  
   104  	var ts uint64
   105  	if !o.isManaged {
   106  		// This is the general case, when user doesn't specify the read and commit ts.
   107  		ts = o.nextCommit
   108  		o.nextCommit++
   109  
   110  	} else {
   111  		// If commitTs is set, use it instead.
   112  		ts = txn.commitTs
   113  	}
   114  
   115  	for _, w := range txn.writes {
   116  		o.commits[w] = ts // Update the commitTs.
   117  	}
   118  	return ts
   119  }
   120  
   121  func (o *oracle) allocTs() uint64 {
   122  	o.Lock()
   123  	ts := o.nextCommit
   124  	o.nextCommit++
   125  	o.Unlock()
   126  	return ts
   127  }
   128  
   129  func (o *oracle) doneCommit(cts uint64) {
   130  	if o.isManaged {
   131  		// No need to update anything.
   132  		return
   133  	}
   134  
   135  	for {
   136  		curRead := atomic.LoadUint64(&o.curRead)
   137  		if cts <= curRead {
   138  			return
   139  		}
   140  		atomic.CompareAndSwapUint64(&o.curRead, curRead, cts)
   141  	}
   142  }
   143  
   144  // Txn represents a Badger transaction.
   145  type Txn struct {
   146  	readTs   uint64
   147  	commitTs uint64
   148  
   149  	update bool     // update is used to conditionally keep track of reads.
   150  	reads  []uint64 // contains fingerprints of keys read.
   151  	writes []uint64 // contains fingerprints of keys written.
   152  
   153  	pendingWrites map[string]*Entry // cache stores any writes done by txn.
   154  
   155  	db        *DB
   156  	discarded bool
   157  	guard     *epoch.Guard
   158  
   159  	size         int64
   160  	count        int64
   161  	numIterators int32
   162  	blobCache    map[uint32]*blobCache
   163  }
   164  
// pendingWritesIterator iterates over a sorted copy of a transaction's
// pending writes.
type pendingWritesIterator struct {
	entries  []*Entry // pending entries, sorted by key (descending when reversed).
	nextIdx  int      // index of the current entry; valid while nextIdx < len(entries).
	readTs   uint64   // read timestamp of the owning txn, reported as the version of every key and value.
	reversed bool     // true when entries are sorted in descending key order.
}
   171  
// Next advances the iterator by one entry.
func (pi *pendingWritesIterator) Next() {
	pi.nextIdx++
}
   175  
// NextVersion always returns false and leaves the iterator where it is: a
// pending key never has more than one version.
func (pi *pendingWritesIterator) NextVersion() bool {
	// We do not support adding multiple versions in a transaction.
	return false
}
   180  
// Rewind moves the iterator back to the first entry.
func (pi *pendingWritesIterator) Rewind() {
	pi.nextIdx = 0
}
   184  
   185  func (pi *pendingWritesIterator) Seek(key []byte) {
   186  	pi.nextIdx = sort.Search(len(pi.entries), func(idx int) bool {
   187  		cmp := bytes.Compare(pi.entries[idx].Key.UserKey, key)
   188  		if !pi.reversed {
   189  			return cmp >= 0
   190  		}
   191  		return cmp <= 0
   192  	})
   193  }
   194  
   195  func (pi *pendingWritesIterator) Key() y.Key {
   196  	y.Assert(pi.Valid())
   197  	entry := pi.entries[pi.nextIdx]
   198  	return y.KeyWithTs(entry.Key.UserKey, pi.readTs)
   199  }
   200  
   201  func (pi *pendingWritesIterator) Value() y.ValueStruct {
   202  	y.Assert(pi.Valid())
   203  	entry := pi.entries[pi.nextIdx]
   204  	return y.ValueStruct{
   205  		Value:    entry.Value,
   206  		Meta:     entry.meta,
   207  		UserMeta: entry.UserMeta,
   208  		Version:  pi.readTs,
   209  	}
   210  }
   211  
   212  func (pi *pendingWritesIterator) FillValue(vs *y.ValueStruct) {
   213  	entry := pi.entries[pi.nextIdx]
   214  	vs.Value = entry.Value
   215  	vs.Meta = entry.meta
   216  	vs.UserMeta = entry.UserMeta
   217  	vs.Version = pi.readTs
   218  }
   219  
   220  func (pi *pendingWritesIterator) Valid() bool {
   221  	return pi.nextIdx < len(pi.entries)
   222  }
   223  
   224  func (txn *Txn) newPendingWritesIterator(reversed bool) *pendingWritesIterator {
   225  	if !txn.update || len(txn.pendingWrites) == 0 {
   226  		return nil
   227  	}
   228  	entries := make([]*Entry, 0, len(txn.pendingWrites))
   229  	for _, e := range txn.pendingWrites {
   230  		entries = append(entries, e)
   231  	}
   232  	// Number of pending writes per transaction shouldn't be too big in general.
   233  	sort.Slice(entries, func(i, j int) bool {
   234  		cmp := entries[i].Key.Compare(entries[j].Key)
   235  		if !reversed {
   236  			return cmp < 0
   237  		}
   238  		return cmp > 0
   239  	})
   240  	return &pendingWritesIterator{
   241  		readTs:   txn.readTs,
   242  		entries:  entries,
   243  		reversed: reversed,
   244  	}
   245  }
   246  
   247  func (txn *Txn) checkSize(e *Entry) error {
   248  	if len(e.UserMeta) > 255 {
   249  		return ErrUserMetaTooLarge
   250  	}
   251  	// Extra bytes for version in key.
   252  	size := int64(e.estimateSize()) + 10
   253  	if size >= txn.db.opt.MaxMemTableSize {
   254  		return ErrTxnTooBig
   255  	}
   256  	txn.count++
   257  	txn.size += size
   258  	return nil
   259  }
   260  
   261  // Set adds a key-value pair to the database.
   262  //
   263  // It will return ErrReadOnlyTxn if update flag was set to false when creating the
   264  // transaction.
   265  func (txn *Txn) Set(key, val []byte) error {
   266  	if txn.db.IsManaged() {
   267  		return ErrManagedTxn
   268  	}
   269  	e := &Entry{
   270  		Key:   y.KeyWithTs(key, 0),
   271  		Value: val,
   272  	}
   273  	return txn.SetEntry(e)
   274  }
   275  
   276  // SetWithMeta adds a key-value pair to the database, along with a metadata
   277  // byte. This byte is stored alongside the key, and can be used as an aid to
   278  // interpret the value or store other contextual bits corresponding to the
   279  // key-value pair.
   280  func (txn *Txn) SetWithMeta(key, val []byte, meta byte) error {
   281  	if txn.db.IsManaged() {
   282  		return ErrManagedTxn
   283  	}
   284  	e := &Entry{Key: y.KeyWithTs(key, 0), Value: val, UserMeta: []byte{meta}}
   285  	return txn.SetEntry(e)
   286  }
   287  
   288  func (txn *Txn) SetWithMetaSlice(key, val, meta []byte) error {
   289  	if txn.db.IsManaged() {
   290  		return ErrManagedTxn
   291  	}
   292  	e := &Entry{Key: y.KeyWithTs(key, 0), Value: val, UserMeta: meta}
   293  	return txn.SetEntry(e)
   294  }
   295  
   296  func (txn *Txn) modify(e *Entry) error {
   297  	if !txn.update {
   298  		return ErrReadOnlyTxn
   299  	} else if txn.discarded {
   300  		return ErrDiscardedTxn
   301  	} else if e.Key.IsEmpty() {
   302  		return ErrEmptyKey
   303  	} else if e.Key.Len() > maxKeySize {
   304  		return exceedsMaxKeySizeError(e.Key.UserKey)
   305  	} else if int64(len(e.Value)) > txn.db.opt.ValueLogFileSize {
   306  		return exceedsMaxValueSizeError(e.Value, txn.db.opt.ValueLogFileSize)
   307  	}
   308  	if err := txn.checkSize(e); err != nil {
   309  		return err
   310  	}
   311  
   312  	fp := farm.Fingerprint64(e.Key.UserKey) // Avoid dealing with byte arrays.
   313  	txn.writes = append(txn.writes, fp)
   314  	txn.pendingWrites[string(e.Key.UserKey)] = e
   315  	return nil
   316  }
   317  
// SetEntry takes an Entry struct and adds the key-value pair in the struct, along
// with other metadata to the database.
//
// It forwards to modify and returns its error unchanged.
func (txn *Txn) SetEntry(e *Entry) error {
	return txn.modify(e)
}
   323  
   324  // Delete deletes a key. This is done by adding a delete marker for the key at commit timestamp.
   325  // Any reads happening before this timestamp would be unaffected. Any reads after this commit would
   326  // see the deletion.
   327  func (txn *Txn) Delete(key []byte) error {
   328  	e := &Entry{
   329  		Key:  y.KeyWithTs(key, 0),
   330  		meta: bitDelete,
   331  	}
   332  	return txn.modify(e)
   333  }
   334  
   335  // Get looks for key and returns corresponding Item.
   336  // If key is not found, ErrKeyNotFound is returned.
   337  func (txn *Txn) Get(key []byte) (item *Item, rerr error) {
   338  	if len(key) == 0 {
   339  		return nil, ErrEmptyKey
   340  	} else if txn.discarded {
   341  		return nil, ErrDiscardedTxn
   342  	}
   343  
   344  	item = new(Item)
   345  	if txn.update {
   346  		if e, has := txn.pendingWrites[string(key)]; has && bytes.Equal(key, e.Key.UserKey) {
   347  			if isDeleted(e.meta) {
   348  				return nil, ErrKeyNotFound
   349  			}
   350  			// Fulfill from cache.
   351  			item.meta = e.meta
   352  			item.vptr = e.Value
   353  			item.userMeta = e.UserMeta
   354  			item.key.UserKey = key
   355  			item.key.Version = txn.readTs
   356  			// We probably don't need to set db on item here.
   357  			return item, nil
   358  		}
   359  		// Only track reads if this is update txn. No need to track read if txn serviced it
   360  		// internally.
   361  		fp := farm.Fingerprint64(key)
   362  		txn.reads = append(txn.reads, fp)
   363  	}
   364  
   365  	seek := y.KeyWithTs(key, txn.readTs)
   366  	var vs y.ValueStruct
   367  	for {
   368  		vs = txn.db.get(seek)
   369  		if !vs.Valid() {
   370  			return nil, ErrKeyNotFound
   371  		}
   372  		if isDeleted(vs.Meta) {
   373  			return nil, ErrKeyNotFound
   374  		}
   375  		break
   376  	}
   377  
   378  	item.key.UserKey = key
   379  	item.key.Version = vs.Version
   380  	item.meta = vs.Meta
   381  	item.userMeta = vs.UserMeta
   382  	item.db = txn.db
   383  	item.vptr = vs.Value
   384  	item.txn = txn
   385  	return item, nil
   386  }
   387  
// keyValuePair carries one lookup of a MultiGet call: the key to look up and
// the slot for its result.
type keyValuePair struct {
	key   y.Key         // user key stamped with the txn's read timestamp.
	hash  uint64        // farm.Fingerprint64 of the user key.
	val   y.ValueStruct // looked-up value; presumably filled in by db.multiGet (confirm there).
	found bool          // whether the key was found; presumably set by db.multiGet (confirm there).
}
   394  
   395  // MultiGet gets items for keys, if not found, the corresponding item will be nil.
   396  // It only supports read-only transaction for simplicity.
   397  func (txn *Txn) MultiGet(keys [][]byte) (items []*Item, err error) {
   398  	if txn.update {
   399  		return nil, errors.New("not supported")
   400  	}
   401  	if txn.discarded {
   402  		return nil, ErrDiscardedTxn
   403  	}
   404  	keyValuePairs := make([]keyValuePair, len(keys))
   405  	for i, key := range keys {
   406  		if len(key) == 0 {
   407  			return nil, ErrEmptyKey
   408  		}
   409  		keyValuePairs[i].hash = farm.Fingerprint64(key)
   410  		keyValuePairs[i].key = y.KeyWithTs(key, txn.readTs)
   411  	}
   412  	txn.db.multiGet(keyValuePairs)
   413  	items = make([]*Item, len(keys))
   414  	for i, pair := range keyValuePairs {
   415  		if pair.found && !isDeleted(pair.val.Meta) {
   416  			items[i] = &Item{
   417  				key: y.Key{
   418  					UserKey: keys[i],
   419  					Version: pair.val.Version,
   420  				},
   421  				meta:     pair.val.Meta,
   422  				userMeta: pair.val.UserMeta,
   423  				db:       txn.db,
   424  				vptr:     pair.val.Value,
   425  				txn:      txn,
   426  			}
   427  		}
   428  	}
   429  	return items, nil
   430  }
   431  
   432  // Discard discards a created transaction. This method is very important and must be called. Commit
   433  // method calls this internally, however, calling this multiple times doesn't cause any issues. So,
   434  // this can safely be called via a defer right when transaction is created.
   435  //
   436  // NOTE: If any operations are run on a discarded transaction, ErrDiscardedTxn is returned.
   437  func (txn *Txn) Discard() {
   438  	if txn.discarded { // Avoid a re-run.
   439  		return
   440  	}
   441  	if atomic.LoadInt32(&txn.numIterators) > 0 {
   442  		panic("Unclosed iterator at time of Txn.Discard.")
   443  	}
   444  	txn.discarded = true
   445  	txn.blobCache = nil
   446  	if txn.update {
   447  		txn.db.orc.decrRef()
   448  	}
   449  	txn.guard.Done()
   450  }
   451  
   452  // Commit commits the transaction, following these steps:
   453  //
   454  // 1. If there are no writes, return immediately.
   455  //
   456  // 2. Check if read rows were updated since txn started. If so, return ErrConflict.
   457  //
   458  // 3. If no conflict, generate a commit timestamp and update written rows' commit ts.
   459  //
   460  // 4. Batch up all writes, write them to value log and LSM tree.
   461  //
   462  // If error is nil, the transaction is successfully committed. In case of a non-nil error, the LSM
   463  // tree won't be updated, so there's no need for any rollback.
   464  func (txn *Txn) Commit() error {
   465  	if txn.discarded {
   466  		return ErrDiscardedTxn
   467  	}
   468  	defer txn.Discard()
   469  	if len(txn.writes) == 0 {
   470  		return nil // Nothing to do.
   471  	}
   472  	managed := txn.db.IsManaged()
   473  	entries := make([]*Entry, 0, len(txn.pendingWrites)+1)
   474  	for _, e := range txn.pendingWrites {
   475  		if managed && e.Key.Version == 0 {
   476  			return fmt.Errorf("version of key %x not specified for managed db", e.Key.UserKey)
   477  		}
   478  		e.meta |= bitTxn
   479  		entries = append(entries, e)
   480  	}
   481  	sort.Slice(entries, func(i, j int) bool {
   482  		return entries[i].Key.Compare(entries[j].Key) < 0
   483  	})
   484  	var commitTs uint64
   485  	state := txn.db.orc
   486  	state.writeLock.Lock()
   487  	if !managed {
   488  		commitTs = state.newCommitTs(txn)
   489  		if commitTs == 0 {
   490  			state.writeLock.Unlock()
   491  			return ErrConflict
   492  		}
   493  		for _, e := range entries {
   494  			// Suffix the keys with commit ts, so the key versions are sorted in
   495  			// descending order of commit timestamp.
   496  			e.Key.Version = commitTs
   497  		}
   498  	}
   499  	// The txnKey entry is used for mark the transaction boundary, the value here is used for assertion.
   500  	e := &Entry{
   501  		Key:   y.KeyWithTs(txnKey, commitTs),
   502  		Value: []byte(strconv.FormatUint(commitTs, 10)),
   503  		meta:  bitFinTxn,
   504  	}
   505  	entries = append(entries, e)
   506  
   507  	req, err := txn.db.sendToWriteCh(entries)
   508  	state.writeLock.Unlock()
   509  	if err != nil {
   510  		return err
   511  	}
   512  
   513  	req.Wait()
   514  	state.doneCommit(commitTs)
   515  
   516  	return nil
   517  }
   518  
   519  // NewTransaction creates a new transaction. Badger supports concurrent execution of transactions,
   520  // providing serializable snapshot isolation, avoiding write skews. Badger achieves this by tracking
   521  // the keys read and at Commit time, ensuring that these read keys weren't concurrently modified by
   522  // another transaction.
   523  //
   524  // For read-only transactions, set update to false. In this mode, we don't track the rows read for
   525  // any changes. Thus, any long running iterations done in this mode wouldn't pay this overhead.
   526  //
   527  // Running transactions concurrently is OK. However, a transaction itself isn't thread safe, and
   528  // should only be run serially. It doesn't matter if a transaction is created by one goroutine and
   529  // passed down to other, as long as the Txn APIs are called serially.
   530  //
   531  // When you create a new transaction, it is absolutely essential to call
   532  // Discard(). This should be done irrespective of what the update param is set
   533  // to. Commit API internally runs Discard, but running it twice wouldn't cause
   534  // any issues.
   535  //
   536  //  txn := db.NewTransaction(false)
   537  //  defer txn.Discard()
   538  //  // Call various APIs.
   539  func (db *DB) NewTransaction(update bool) *Txn {
   540  	if db.opt.ReadOnly {
   541  		// DB is read-only, force read-only transaction.
   542  		update = false
   543  	}
   544  	readTs := db.orc.readTs()
   545  	txn := &Txn{
   546  		update: update,
   547  		db:     db,
   548  		count:  1,                       // One extra entry for BitFin.
   549  		size:   int64(len(txnKey) + 10), // Some buffer for the extra entry.
   550  		readTs: readTs,
   551  	}
   552  	if !db.IsManaged() {
   553  		txn.guard = db.resourceMgr.AcquireWithPayload(readTs)
   554  	} else {
   555  		txn.guard = db.resourceMgr.Acquire()
   556  	}
   557  	if update {
   558  		txn.pendingWrites = make(map[string]*Entry)
   559  		txn.db.orc.addRef()
   560  	}
   561  	return txn
   562  }
   563  
   564  // View executes a function creating and managing a read-only transaction for the user. Error
   565  // returned by the function is relayed by the View method.
   566  func (db *DB) View(fn func(txn *Txn) error) error {
   567  	txn := db.NewTransaction(false)
   568  	if db.IsManaged() {
   569  		txn.SetReadTS(math.MaxUint64)
   570  	}
   571  	defer txn.Discard()
   572  
   573  	return fn(txn)
   574  }
   575  
// SetReadTS sets the timestamp the transaction reads the DB at. It can only
// be used in a managed DB: the call asserts txn.db.IsManaged() before
// changing the timestamp (y.Assert presumably panics when the assertion
// fails; confirm in package y).
func (txn *Txn) SetReadTS(readTS uint64) {
	y.Assert(txn.db.IsManaged())
	txn.readTs = readTS
}
   581  
   582  // Update executes a function, creating and managing a read-write transaction
   583  // for the user. Error returned by the function is relayed by the Update method.
   584  func (db *DB) Update(fn func(txn *Txn) error) error {
   585  	txn := db.NewTransaction(true)
   586  	defer txn.Discard()
   587  
   588  	if err := fn(txn); err != nil {
   589  		return err
   590  	}
   591  
   592  	return txn.Commit()
   593  }