github.com/vescale/zgraph@v0.0.0-20230410094002-959c02d50f95/storage/transaction.go (about)

     1  // Copyright 2022 zGraph Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package storage
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"math"
    21  	"sync"
    22  	"time"
    23  
    24  	"github.com/cenkalti/backoff"
    25  	"github.com/cockroachdb/pebble"
    26  	"github.com/vescale/zgraph/storage/kv"
    27  	"github.com/vescale/zgraph/storage/latch"
    28  	"github.com/vescale/zgraph/storage/mvcc"
    29  	"github.com/vescale/zgraph/storage/resolver"
    30  	"go.uber.org/atomic"
    31  )
    32  
    33  // Txn represents a transaction implemented beyond the low-level key/value storage.
    34  type Txn struct {
    35  	mu        sync.Mutex
    36  	vp        kv.VersionProvider
    37  	db        *pebble.DB
    38  	us        *UnionStore
    39  	latches   *latch.LatchesScheduler
    40  	resolver  *resolver.Scheduler
    41  	valid     bool
    42  	snapshot  kv.Snapshot
    43  	startTime time.Time
    44  	startVer  kv.Version
    45  	commitVer kv.Version
    46  	setCnt    int64
    47  	lockedCnt int
    48  }
    49  
    50  // Get implements the Transaction interface.
    51  func (txn *Txn) Get(ctx context.Context, k kv.Key) ([]byte, error) {
    52  	return txn.us.Get(ctx, k)
    53  }
    54  
    55  // Iter creates an Iterator positioned on the first entry that k <= entry's key.
    56  // If such entry is not found, it returns an invalid Iterator with no error.
    57  // It yields only keys that < upperBound. If upperBound is nil, it means the upperBound is unbounded.
    58  // The Iterator must be Closed after use.
    59  func (txn *Txn) Iter(lowerBound, upperBound kv.Key) (kv.Iterator, error) {
    60  	return txn.us.Iter(lowerBound, upperBound)
    61  }
    62  
    63  // IterReverse creates a reversed Iterator positioned on the first entry which key is less than k.
    64  func (txn *Txn) IterReverse(lowerBound, upperBound kv.Key) (kv.Iterator, error) {
    65  	return txn.us.IterReverse(lowerBound, upperBound)
    66  }
    67  
    68  // Set implements the Transaction interface.
    69  // It sets the value for key k as v into kv store.
    70  // v must NOT be nil or empty, otherwise it returns ErrCannotSetNilValue.
    71  func (txn *Txn) Set(k kv.Key, v []byte) error {
    72  	txn.setCnt++
    73  	return txn.us.MemBuffer().Set(k, v)
    74  }
    75  
    76  // Delete implements the Transaction interface. It removes the entry for key k from kv store.
    77  func (txn *Txn) Delete(k kv.Key) error {
    78  	return txn.us.MemBuffer().Delete(k)
    79  }
    80  
    81  // StartVer implements the Transaction interface.
    82  func (txn *Txn) StartVer() kv.Version {
    83  	return txn.startVer
    84  }
    85  
    86  // Snapshot implements the Transaction interface.
    87  func (txn *Txn) Snapshot() kv.Snapshot {
    88  	return txn.snapshot
    89  }
    90  
    91  // BatchGet implements the Transaction interface.
    92  // It gets kv from the memory buffer of statement and transaction, and the kv storage.
    93  // Do not use len(value) == 0 or value == nil to represent non-exist.
    94  // If a key doesn't exist, there shouldn't be any corresponding entry in the result map.
    95  func (txn *Txn) BatchGet(ctx context.Context, keys []kv.Key) (map[string][]byte, error) {
    96  	return NewBufferBatchGetter(txn.us.MemBuffer(), txn.Snapshot()).BatchGet(ctx, keys)
    97  }
    98  
    99  // Size implements the Transaction interface. It returns sum of keys and values length.
   100  func (txn *Txn) Size() int {
   101  	return txn.us.MemBuffer().Size()
   102  }
   103  
   104  // Len implements the Transaction interface. It returns the number of entries in the DB.
   105  func (txn *Txn) Len() int {
   106  	return txn.us.MemBuffer().Len()
   107  }
   108  
   109  // Reset implements the Transaction interface. It resets the Transaction to initial states.
   110  func (txn *Txn) Reset() {
   111  	txn.us.MemBuffer().Reset()
   112  }
   113  
   114  func (txn *Txn) Commit(_ context.Context) error {
   115  	if !txn.valid {
   116  		return kv.ErrInvalidTxn
   117  	}
   118  	defer txn.close()
   119  
   120  	// Sanity check for start timestamp of the current transaction.
   121  	if txn.startVer == mvcc.LockVer {
   122  		return kv.ErrInvalidStartVer
   123  	}
   124  
   125  	committer := &committer{
   126  		db:       txn.db,
   127  		memDB:    txn.us.MemBuffer(),
   128  		resolver: txn.resolver,
   129  		startVer: txn.startVer,
   130  	}
   131  	err := committer.init(txn.startTime)
   132  	if err != nil {
   133  		return err
   134  	}
   135  	if committer.length() == 0 {
   136  		return nil
   137  	}
   138  	keys := committer.keys()
   139  
   140  	err = backoff.RetryNotify(func() error {
   141  		// Note: don't use `defer txn.latches.UnLock(lock)` here. we need to keep the
   142  		// lock fine-grain.
   143  		// Because the subsequent routine may time-consumed:
   144  		//   - CheckTxnStatus: will be slow if the IO usage is high.
   145  		//   - Resolve: will block if the worker queue full.
   146  		lock := txn.latches.Lock(txn.startVer, keys)
   147  		err := committer.prepare()
   148  		errg, ok := err.(*kv.ErrGroup)
   149  		if !ok {
   150  			txn.latches.UnLock(lock)
   151  			return err
   152  		}
   153  		// Prepare transaction successfully means all lock are written into the low-level
   154  		// storage.
   155  		if len(errg.Errors) == 0 {
   156  			commitVer := txn.vp.CurrentVersion()
   157  			txn.commitVer = commitVer
   158  			committer.commitVer = commitVer
   159  			lock.SetCommitVer(commitVer)
   160  			txn.latches.UnLock(lock)
   161  			return nil
   162  		}
   163  		txn.latches.UnLock(lock)
   164  
   165  		rollbacks := map[kv.Version][]kv.Key{}
   166  		committed := map[kv.VersionPair][]kv.Key{}
   167  		for _, err := range errg.Errors {
   168  			// Try to resolve keys locked error.
   169  			lockedErr, ok := err.(*mvcc.LockedError)
   170  			if !ok {
   171  				return &backoff.PermanentError{Err: err}
   172  			}
   173  
   174  			status, err := resolver.CheckTxnStatus(txn.db, txn.vp, lockedErr.Primary, lockedErr.StartVer)
   175  			if err != nil {
   176  				return &backoff.PermanentError{Err: err}
   177  			}
   178  			switch status.Action {
   179  			case resolver.TxnActionNone:
   180  				// Transaction is still alive and try it letter.
   181  				continue
   182  
   183  			case resolver.TxnActionTTLExpireRollback,
   184  				resolver.TxnActionLockNotExistRollback:
   185  				// Resolve the current key.
   186  				rollbacks[lockedErr.StartVer] = append(rollbacks[lockedErr.StartVer], lockedErr.Key)
   187  				continue
   188  
   189  			default:
   190  				// TxnActionLockNotExistDoNothing
   191  				// Transaction committed: we try to resolve the current key and backoff.
   192  				pair := kv.VersionPair{StartVer: lockedErr.StartVer, CommitVer: status.CommitVer}
   193  				committed[pair] = append(committed[pair], lockedErr.Key)
   194  				continue
   195  			}
   196  		}
   197  
   198  		if len(rollbacks) > 0 {
   199  			for startVer, keys := range rollbacks {
   200  				txn.resolver.Resolve(keys, startVer, 0, nil)
   201  				committer.resolved = append(committer.resolved, startVer)
   202  			}
   203  		}
   204  		if len(committed) > 0 {
   205  			for pair, keys := range committed {
   206  				txn.resolver.Resolve(keys, pair.StartVer, pair.CommitVer, nil)
   207  			}
   208  		}
   209  
   210  		return resolver.ErrRetryable("resolving locks in transaction prepare staging")
   211  	}, expoBackoff(), BackoffErrReporter("committer.execute"))
   212  	if err != nil {
   213  		return err
   214  	}
   215  
   216  	return committer.commit()
   217  }
   218  
   219  // Rollback implements the Transaction interface. It undoes the transaction operations to KV store.
   220  func (txn *Txn) Rollback() error {
   221  	if !txn.valid {
   222  		return kv.ErrInvalidTxn
   223  	}
   224  	txn.close()
   225  	return nil
   226  }
   227  
   228  // String implements fmt.Stringer interface.
   229  func (txn *Txn) String() string {
   230  	return fmt.Sprintf("%d", txn.startVer)
   231  }
   232  
   233  func (txn *Txn) close() {
   234  	txn.valid = false
   235  }
   236  
   237  // committer represents the transaction 2 phase committer. It will calculate the
   238  // mutations and apply to the low-level storage.
   239  type committer struct {
   240  	db         *pebble.DB
   241  	memDB      *MemDB
   242  	resolver   *resolver.Scheduler
   243  	startVer   kv.Version
   244  	commitVer  kv.Version
   245  	resolved   []kv.Version
   246  	primaryIdx int
   247  	primaryKey kv.Key
   248  	lockTTL    uint64
   249  	handles    []MemKeyHandle
   250  
   251  	// counter of mutations
   252  	size, putCnt, delCnt, lockCnt, checkCnt int
   253  
   254  	// The commit status
   255  	mu struct {
   256  		sync.RWMutex
   257  		undeterminedErr error // undeterminedErr saves the rpc error we encounter when commit primary key.
   258  		committed       bool
   259  	}
   260  }
   261  
   262  // init initializes the keys and mutations.
   263  func (c *committer) init(startTime time.Time) error {
   264  	// Foreach all the changes cached in the memory buffer and build the mutations.
   265  	var err error
   266  	for it := c.memDB.IterWithFlags(nil, nil); it.Valid(); err = it.Next() {
   267  		// TODO: handle error properly
   268  		_ = err
   269  
   270  		var (
   271  			key   = it.Key()
   272  			flags = it.Flags()
   273  			value []byte
   274  			op    mvcc.Op
   275  		)
   276  
   277  		if !it.HasValue() {
   278  			if !flags.HasLocked() {
   279  				continue
   280  			}
   281  			op = mvcc.Op_Lock
   282  			c.lockCnt++
   283  		} else {
   284  			value = it.Value()
   285  			if len(value) > 0 {
   286  				op = mvcc.Op_Put
   287  				if flags.HasPresumeKeyNotExists() {
   288  					op = mvcc.Op_Insert
   289  				}
   290  				c.putCnt++
   291  			} else if flags.HasPresumeKeyNotExists() {
   292  				// delete-your-writes keys in optimistic txn need check not exists in prewrite-phase
   293  				// due to `Op_CheckNotExists` doesn't prewrite lock, so mark those keys should not be used in commit-phase.
   294  				op = mvcc.Op_CheckNotExists
   295  				c.checkCnt++
   296  				c.memDB.UpdateFlags(key, kv.SetPrewriteOnly)
   297  			} else if flags.HasNewlyInserted() {
   298  				// The delete-your-write keys in pessimistic transactions, only lock needed keys and skip
   299  				// other deletes for example the secondary index delete.
   300  				// Here if `tidb_constraint_check_in_place` is enabled and the transaction is in optimistic mode,
   301  				// the logic is same as the pessimistic mode.
   302  				if flags.HasLocked() {
   303  					op = mvcc.Op_Lock
   304  					c.lockCnt++
   305  				} else {
   306  					continue
   307  				}
   308  			} else {
   309  				op = mvcc.Op_Del
   310  				c.delCnt++
   311  			}
   312  
   313  			handle := it.Handle()
   314  			handle.op = op
   315  			handle.flags = flags
   316  			c.handles = append(c.handles, handle)
   317  			c.size += len(key) + len(value)
   318  		}
   319  
   320  		// Choose the first valid key as the primary key of the current transaction.
   321  		if len(c.primaryKey) == 0 && op != mvcc.Op_CheckNotExists {
   322  			c.primaryIdx = len(c.handles) - 1
   323  			c.primaryKey = key
   324  		}
   325  	}
   326  
   327  	if len(c.handles) == 0 {
   328  		return nil
   329  	}
   330  	c.lockTTL = txnLockTTL(startTime, c.size)
   331  
   332  	return nil
   333  }
   334  
   335  func (c *committer) length() int {
   336  	return len(c.handles)
   337  }
   338  
   339  // keys returns keys of all mutations in the current transaction.
   340  func (c *committer) keys() []kv.Key {
   341  	keys := make([]kv.Key, len(c.handles))
   342  	for i, h := range c.handles {
   343  		keys[i] = c.memDB.GetKeyByHandle(h)
   344  	}
   345  	return keys
   346  }
   347  
   348  // prepare implements the first stage of 2PC transaction model.
   349  func (c *committer) prepare() error {
   350  	var (
   351  		errs       []error
   352  		batch      = c.db.NewBatch()
   353  		primaryKey = c.primaryKey
   354  		startVer   = c.startVer
   355  		resolved   = c.resolved
   356  	)
   357  	defer batch.Close()
   358  
   359  	for _, h := range c.handles {
   360  		op := h.op
   361  		key := c.memDB.GetKeyByHandle(h)
   362  		enc := mvcc.Encode(key, mvcc.LockVer)
   363  		opt := pebble.IterOptions{
   364  			LowerBound: enc,
   365  		}
   366  		if op == mvcc.Op_Insert || op == mvcc.Op_CheckNotExists {
   367  			iter := c.db.NewIter(&opt)
   368  			iter.First()
   369  			val, err := getValue(iter, key, startVer, resolved)
   370  			_ = iter.Close()
   371  			if err != nil {
   372  				errs = append(errs, err)
   373  				continue
   374  			}
   375  			if val != nil {
   376  				err = &kv.ErrKeyAlreadyExist{
   377  					Key: key,
   378  				}
   379  				errs = append(errs, err)
   380  				continue
   381  			}
   382  		}
   383  		if op == mvcc.Op_CheckNotExists {
   384  			continue
   385  		}
   386  
   387  		err := func() error {
   388  			iter := c.db.NewIter(&opt)
   389  			iter.First()
   390  			defer iter.Close()
   391  
   392  			decoder := mvcc.LockDecoder{ExpectKey: key}
   393  			exists, err := decoder.Decode(iter)
   394  			if err != nil {
   395  				return err
   396  			}
   397  
   398  			// There is a lock exists.
   399  			if exists && decoder.Lock.StartVer != startVer {
   400  				return decoder.Lock.LockErr(key)
   401  			}
   402  
   403  			// Check conflicts
   404  			vdecoder := mvcc.ValueDecoder{ExpectKey: key}
   405  			exists, err = vdecoder.Decode(iter)
   406  			if err != nil {
   407  				return err
   408  			}
   409  			if exists && vdecoder.Value.CommitVer > startVer {
   410  				return &kv.ErrConflict{
   411  					StartVer:          startVer,
   412  					ConflictStartVer:  vdecoder.Value.StartVer,
   413  					ConflictCommitVer: vdecoder.Value.CommitVer,
   414  					Key:               key,
   415  				}
   416  			}
   417  			return nil
   418  		}()
   419  		if err != nil {
   420  			errs = append(errs, err)
   421  			continue
   422  		}
   423  
   424  		// Append the current row key into the write batch.
   425  		if op == mvcc.Op_Insert {
   426  			op = mvcc.Op_Put
   427  		}
   428  		val, _ := c.memDB.GetValueByHandle(h)
   429  		l := mvcc.Lock{
   430  			StartVer: startVer,
   431  			Primary:  primaryKey,
   432  			Value:    val,
   433  			Op:       op,
   434  			TTL:      c.lockTTL,
   435  		}
   436  		writeVal, err := l.MarshalBinary()
   437  		if err != nil {
   438  			errs = append(errs, err)
   439  			continue
   440  		}
   441  		err = batch.Set(enc, writeVal, nil)
   442  		if err != nil {
   443  			errs = append(errs, err)
   444  			continue
   445  		}
   446  	}
   447  
   448  	// Commit the current write batch into the low-level storage engine.
   449  	if err := batch.Commit(nil); err != nil {
   450  		return err
   451  	}
   452  
   453  	return &kv.ErrGroup{Errors: errs}
   454  }
   455  
   456  // commit implements the second stage of 2PC transaction model.
   457  func (c *committer) commit() error {
   458  	batch := c.db.NewBatch()
   459  	defer batch.Close()
   460  
   461  	// Commit primary key first.
   462  	err := resolver.Resolve(c.db, batch, c.primaryKey, c.startVer, c.commitVer)
   463  	if err != nil {
   464  		return err
   465  	}
   466  	err = batch.Commit(nil)
   467  	if err != nil {
   468  		return err
   469  	}
   470  
   471  	// The remained keys submit to resolver to resolve them asynchronously.
   472  	var remainedKeys []kv.Key
   473  	for i, h := range c.handles {
   474  		// The primary key had been committed.
   475  		if i == c.primaryIdx {
   476  			continue
   477  		}
   478  		if h.op == mvcc.Op_CheckNotExists {
   479  			continue
   480  		}
   481  
   482  		// Note: the keys stored in MemDB are reference to MemDB and its lifetime
   483  		// bound to the MemDB. We will release MemDB instance after the transaction
   484  		// committed. So we need to copy the keys, then submit them to the resolver.
   485  		key := c.memDB.GetKeyByHandle(h)
   486  		cpy := make(kv.Key, len(key))
   487  		copy(cpy, key)
   488  		remainedKeys = append(remainedKeys, cpy)
   489  	}
   490  	c.resolver.Resolve(remainedKeys, c.startVer, c.commitVer, nil)
   491  
   492  	return nil
   493  }
   494  
   495  const bytesPerMiB = 1024 * 1024
   496  
   497  // ttl = ttlFactor * sqrt(writeSizeInMiB)
   498  var ttlFactor = 6000
   499  
   500  // By default, locks after 3000ms is considered unusual (the client created the
   501  // lock might be dead). Other client may clean up this kind of lock.
   502  // For locks created recently, we will do backoff and retry.
   503  var defaultLockTTL uint64 = 3000
   504  
   505  // Global variable set by config file.
   506  var (
   507  	ManagedLockTTL uint64 = 20000 // 20s
   508  )
   509  
   510  var (
   511  	// PrewriteMaxBackoff is max sleep time of the `pre-write` command.
   512  	PrewriteMaxBackoff = atomic.NewUint64(40000)
   513  	// CommitMaxBackoff is max sleep time of the 'commit' command
   514  	CommitMaxBackoff = uint64(40000)
   515  )
   516  
   517  func txnLockTTL(startTime time.Time, txnSize int) uint64 {
   518  	// Increase lockTTL for large transactions.
   519  	// The formula is `ttl = ttlFactor * sqrt(sizeInMiB)`.
   520  	// When writeSize is less than 256KB, the base ttl is defaultTTL (3s);
   521  	// When writeSize is 1MiB, 4MiB, or 10MiB, ttl is 6s, 12s, 20s correspondingly;
   522  	lockTTL := defaultLockTTL
   523  	if txnSize >= int(kv.TxnCommitBatchSize.Load()) {
   524  		sizeMiB := float64(txnSize) / bytesPerMiB
   525  		lockTTL = uint64(float64(ttlFactor) * math.Sqrt(sizeMiB))
   526  		if lockTTL < defaultLockTTL {
   527  			lockTTL = defaultLockTTL
   528  		}
   529  		if lockTTL > ManagedLockTTL {
   530  			lockTTL = ManagedLockTTL
   531  		}
   532  	}
   533  
   534  	// Increase lockTTL by the transaction's read time.
   535  	// When resolving a lock, we compare current ver and startVer+lockTTL to decide whether to clean up. If a txn
   536  	// takes a long time to read, increasing its TTL will help to prevent it from been aborted soon after prewrite.
   537  	elapsed := time.Since(startTime) / time.Millisecond
   538  	return lockTTL + uint64(elapsed)
   539  }