github.com/KinWaiYuen/client-go/v2@v2.5.4/txnkv/txnlock/lock_resolver.go

// Copyright 2021 TiKV Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package txnlock

import (
	"bytes"
	"container/list"
	"context"
	"encoding/hex"
	"fmt"
	"math"
	"sync"
	"time"

	"github.com/KinWaiYuen/client-go/v2/config"
	tikverr "github.com/KinWaiYuen/client-go/v2/error"
	"github.com/KinWaiYuen/client-go/v2/internal/client"
	"github.com/KinWaiYuen/client-go/v2/internal/locate"
	"github.com/KinWaiYuen/client-go/v2/internal/logutil"
	"github.com/KinWaiYuen/client-go/v2/internal/retry"
	"github.com/KinWaiYuen/client-go/v2/metrics"
	"github.com/KinWaiYuen/client-go/v2/oracle"
	"github.com/KinWaiYuen/client-go/v2/tikvrpc"
	"github.com/KinWaiYuen/client-go/v2/util"
	"github.com/pingcap/errors"
	"github.com/pingcap/kvproto/pkg/kvrpcpb"
	"go.uber.org/zap"
)

// ResolvedCacheSize is the max number of cached txn statuses.
const ResolvedCacheSize = 2048

type storage interface {
	// GetRegionCache gets the RegionCache.
	GetRegionCache() *locate.RegionCache
	// SendReq sends a request to TiKV.
	SendReq(bo *retry.Backoffer, req *tikvrpc.Request, regionID locate.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error)
	// GetOracle gets a timestamp oracle client.
	GetOracle() oracle.Oracle
}

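// A minimal sketch of a storage implementation, e.g. for unit tests. The
// mockStorage type and its fields are illustrative assumptions, not part of
// this package:
//
//	type mockStorage struct {
//		regionCache *locate.RegionCache
//		o           oracle.Oracle
//	}
//
//	func (s *mockStorage) GetRegionCache() *locate.RegionCache { return s.regionCache }
//	func (s *mockStorage) GetOracle() oracle.Oracle            { return s.o }
//	func (s *mockStorage) SendReq(bo *retry.Backoffer, req *tikvrpc.Request,
//		regionID locate.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) {
//		return nil, errors.New("not implemented")
//	}
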
// LockResolver resolves locks and also caches resolved txn statuses.
type LockResolver struct {
	store                    storage
	resolveLockLiteThreshold uint64
	mu                       struct {
		sync.RWMutex
		// resolved caches resolved txns (FIFO, txn id -> txnStatus).
		resolved       map[uint64]TxnStatus
		recentResolved *list.List
	}
	testingKnobs struct {
		meetLock func(locks []*Lock)
	}
}

// NewLockResolver creates a new LockResolver instance.
func NewLockResolver(store storage) *LockResolver {
	r := &LockResolver{
		store:                    store,
		resolveLockLiteThreshold: config.GetGlobalConfig().TiKVClient.ResolveLockLiteThreshold,
	}
	r.mu.resolved = make(map[uint64]TxnStatus)
	r.mu.recentResolved = list.New()
	return r
}
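
// Usage sketch (illustrative): a resolver is typically created once per store
// and reused across transactions, since it caches resolved txn statuses.
// Here s stands for any value implementing the storage interface above:
//
//	lr := NewLockResolver(s)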

// TxnStatus represents a txn's final status. It should be Lock, Commit, or Rollback.
type TxnStatus struct {
	ttl         uint64
	commitTS    uint64
	action      kvrpcpb.Action
	primaryLock *kvrpcpb.LockInfo
}

// IsCommitted returns true if the txn's final status is Commit.
func (s TxnStatus) IsCommitted() bool { return s.ttl == 0 && s.commitTS > 0 }

// CommitTS returns the txn's commitTS. It is valid iff `IsCommitted` is true.
func (s TxnStatus) CommitTS() uint64 { return s.commitTS }

// TTL returns the TTL of the transaction if the transaction is still alive.
func (s TxnStatus) TTL() uint64 { return s.ttl }

// Action returns what the CheckTxnStatus request has done to the transaction.
func (s TxnStatus) Action() kvrpcpb.Action { return s.action }

// StatusCacheable checks whether the transaction status is certain. True is
// returned if its status is certain:
//     If the transaction is already committed, the result can be cached.
//     Otherwise:
//       If l.LockType is a pessimistic lock type:
//           - if its primary lock is pessimistic too, the check txn status result should not be cached.
//           - if its primary lock is a prewrite lock type, the check txn status result can be cached.
//       If l.LockType is a prewrite lock type:
//           - always cache the check txn status result.
// For prewrite locks, their primary keys should ALWAYS be the correct one and will NOT change.
func (s TxnStatus) StatusCacheable() bool {
	if s.IsCommitted() {
		return true
	}
	if s.ttl == 0 {
		if s.action == kvrpcpb.Action_NoAction ||
			s.action == kvrpcpb.Action_LockNotExistRollback ||
			s.action == kvrpcpb.Action_TTLExpireRollback {
			return true
		}
	}
	return false
}
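
// For example (illustrative values): a committed status is always cacheable,
// while a rollback performed while resolving a pessimistic primary lock is
// not, since a pessimistic transaction's primary may change:
//
//	committed := TxnStatus{commitTS: 400}
//	_ = committed.StatusCacheable() // true: ttl == 0 and commitTS > 0
//
//	rolledBack := TxnStatus{action: kvrpcpb.Action_TTLExpirePessimisticRollback}
//	_ = rolledBack.StatusCacheable() // false: the action is not in the cacheable set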

// Lock represents a lock from the TiKV server.
type Lock struct {
	Key             []byte
	Primary         []byte
	TxnID           uint64
	TTL             uint64
	TxnSize         uint64
	LockType        kvrpcpb.Op
	UseAsyncCommit  bool
	LockForUpdateTS uint64
	MinCommitTS     uint64
}

func (l *Lock) String() string {
	buf := bytes.NewBuffer(make([]byte, 0, 128))
	buf.WriteString("key: ")
	buf.WriteString(hex.EncodeToString(l.Key))
	buf.WriteString(", primary: ")
	buf.WriteString(hex.EncodeToString(l.Primary))
	return fmt.Sprintf("%s, txnStartTS: %d, lockForUpdateTS:%d, minCommitTs:%d, ttl: %d, type: %s, UseAsyncCommit: %t, txnSize: %d",
		buf.String(), l.TxnID, l.LockForUpdateTS, l.MinCommitTS, l.TTL, l.LockType, l.UseAsyncCommit, l.TxnSize)
}

// NewLock creates a new *Lock.
func NewLock(l *kvrpcpb.LockInfo) *Lock {
	return &Lock{
		Key:             l.GetKey(),
		Primary:         l.GetPrimaryLock(),
		TxnID:           l.GetLockVersion(),
		TTL:             l.GetLockTtl(),
		TxnSize:         l.GetTxnSize(),
		LockType:        l.LockType,
		UseAsyncCommit:  l.UseAsyncCommit,
		LockForUpdateTS: l.LockForUpdateTs,
		MinCommitTS:     l.MinCommitTs,
	}
}

func (lr *LockResolver) saveResolved(txnID uint64, status TxnStatus) {
	lr.mu.Lock()
	defer lr.mu.Unlock()

	if _, ok := lr.mu.resolved[txnID]; ok {
		return
	}
	lr.mu.resolved[txnID] = status
	lr.mu.recentResolved.PushBack(txnID)
	// Evict the oldest entry (FIFO) once the cache exceeds ResolvedCacheSize.
	if len(lr.mu.resolved) > ResolvedCacheSize {
		front := lr.mu.recentResolved.Front()
		delete(lr.mu.resolved, front.Value.(uint64))
		lr.mu.recentResolved.Remove(front)
	}
}

func (lr *LockResolver) getResolved(txnID uint64) (TxnStatus, bool) {
	lr.mu.RLock()
	defer lr.mu.RUnlock()

	s, ok := lr.mu.resolved[txnID]
	return s, ok
}

// BatchResolveLocks resolves locks in a batch.
// It should be used by the GC worker only!
func (lr *LockResolver) BatchResolveLocks(bo *retry.Backoffer, locks []*Lock, loc locate.RegionVerID) (bool, error) {
	if len(locks) == 0 {
		return true, nil
	}

	metrics.LockResolverCountWithBatchResolve.Inc()

	// The GCWorker kills all ongoing transactions, because it must make sure all
	// locks have been cleaned before GC.
	expiredLocks := locks

	txnInfos := make(map[uint64]uint64)
	startTime := time.Now()
	for _, l := range expiredLocks {
		if _, ok := txnInfos[l.TxnID]; ok {
			continue
		}
		metrics.LockResolverCountWithExpired.Inc()

		// Using currentTS = math.MaxUint64 means rolling back the txn, no matter whether the lock is expired or not!
		status, err := lr.getTxnStatus(bo, l.TxnID, l.Primary, 0, math.MaxUint64, true, false, l)
		if err != nil {
			return false, err
		}

		// If the transaction uses async commit, CheckTxnStatus will reject rolling back the primary lock.
		// Then we need to check the secondary locks to determine the final status of the transaction.
		if status.primaryLock != nil && status.primaryLock.UseAsyncCommit {
			resolveData, err := lr.checkAllSecondaries(bo, l, &status)
			if err == nil {
				txnInfos[l.TxnID] = resolveData.commitTs
				continue
			}
			if _, ok := errors.Cause(err).(*nonAsyncCommitLock); ok {
				status, err = lr.getTxnStatus(bo, l.TxnID, l.Primary, 0, math.MaxUint64, true, true, l)
				if err != nil {
					return false, err
				}
			} else {
				return false, err
			}
		}

		if status.ttl > 0 {
			logutil.BgLogger().Error("BatchResolveLocks fail to clean locks, this result is not expected!")
			return false, errors.New("TiDB asks TiKV to roll back locks but it doesn't, the protocol may be wrong")
		}

		txnInfos[l.TxnID] = status.commitTS
	}
	logutil.BgLogger().Info("BatchResolveLocks: lookup txn status",
		zap.Duration("cost time", time.Since(startTime)),
		zap.Int("num of txn", len(txnInfos)))

	listTxnInfos := make([]*kvrpcpb.TxnInfo, 0, len(txnInfos))
	for txnID, status := range txnInfos {
		listTxnInfos = append(listTxnInfos, &kvrpcpb.TxnInfo{
			Txn:    txnID,
			Status: status,
		})
	}

	req := tikvrpc.NewRequest(tikvrpc.CmdResolveLock, &kvrpcpb.ResolveLockRequest{TxnInfos: listTxnInfos})
	req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
	startTime = time.Now()
	resp, err := lr.store.SendReq(bo, req, loc, client.ReadTimeoutShort)
	if err != nil {
		return false, errors.Trace(err)
	}

	regionErr, err := resp.GetRegionError()
	if err != nil {
		return false, errors.Trace(err)
	}

	if regionErr != nil {
		err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
		if err != nil {
			return false, errors.Trace(err)
		}
		return false, nil
	}

	if resp.Resp == nil {
		return false, errors.Trace(tikverr.ErrBodyMissing)
	}
	cmdResp := resp.Resp.(*kvrpcpb.ResolveLockResponse)
	if keyErr := cmdResp.GetError(); keyErr != nil {
		return false, errors.Errorf("unexpected resolve err: %s", keyErr)
	}

	logutil.BgLogger().Info("BatchResolveLocks: resolve locks in a batch",
		zap.Duration("cost time", time.Since(startTime)),
		zap.Int("num of locks", len(expiredLocks)))
	return true, nil
}
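
// Usage sketch (illustrative): a GC-worker style caller first scans a region
// for locks and then resolves them all in one batch. scannedLocks and
// regionVerID are assumed to come from that preceding scan:
//
//	ok, err := lr.BatchResolveLocks(bo, scannedLocks, regionVerID)
//	if err != nil {
//		return err
//	}
//	if !ok {
//		// A region error occurred and a backoff was performed;
//		// re-locate the region and retry the batch.
//	}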

// ResolveLocks tries to resolve Locks. The resolving process is in 3 steps:
// 1) Use the `lockTTL` to pick up all expired locks. Only locks that are too
//    old are considered orphan locks and will be handled later. If all locks
//    are expired then all locks will be resolved and the returned expire time
//    will be 0; otherwise the caller should sleep a while before retrying.
// 2) For each lock, query the primary key to get the commit status of the txn
//    which left the lock.
// 3) Send a `ResolveLock` cmd to the lock's region to resolve all locks that
//    belong to the same transaction.
func (lr *LockResolver) ResolveLocks(bo *retry.Backoffer, callerStartTS uint64, locks []*Lock) (int64, []uint64 /*pushed*/, error) {
	return lr.resolveLocks(bo, callerStartTS, locks, false, false)
}
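
// Usage sketch (illustrative): a reader that meets locks resolves them and,
// if some lock owners are still alive, backs off before retrying the read.
// callerStartTS and locks are assumptions for the example:
//
//	msBeforeExpired, pushed, err := lr.ResolveLocks(bo, callerStartTS, locks)
//	if err != nil {
//		return err
//	}
//	if msBeforeExpired > 0 && len(pushed) == 0 {
//		// Some lock is still alive and its minCommitTS was not pushed:
//		// wait up to msBeforeExpired milliseconds before retrying.
//	}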

// ResolveLocksLite resolves locks while avoiding a scan of the whole region.
func (lr *LockResolver) ResolveLocksLite(bo *retry.Backoffer, callerStartTS uint64, locks []*Lock) (int64, []uint64 /*pushed*/, error) {
	return lr.resolveLocks(bo, callerStartTS, locks, false, true)
}

func (lr *LockResolver) resolveLocks(bo *retry.Backoffer, callerStartTS uint64, locks []*Lock, forWrite bool, lite bool) (int64, []uint64 /*pushed*/, error) {
	if lr.testingKnobs.meetLock != nil {
		lr.testingKnobs.meetLock(locks)
	}
	var msBeforeTxnExpired txnExpireTime
	if len(locks) == 0 {
		return msBeforeTxnExpired.value(), nil, nil
	}

	if forWrite {
		metrics.LockResolverCountWithResolveForWrite.Inc()
	} else {
		metrics.LockResolverCountWithResolve.Inc()
	}

	var pushFail bool
	// TxnID -> []Region, records resolved Regions.
	// TODO: Maybe put it in LockResolver and share it among all txns.
	cleanTxns := make(map[uint64]map[locate.RegionVerID]struct{})
	var pushed []uint64
	// pushed is only used in the read operation.
	if !forWrite {
		pushed = make([]uint64, 0, len(locks))
	}

	var resolve func(*Lock, bool) error
	resolve = func(l *Lock, forceSyncCommit bool) error {
		status, err := lr.getTxnStatusFromLock(bo, l, callerStartTS, forceSyncCommit)
		if err != nil {
			return err
		}

		if status.ttl == 0 {
			metrics.LockResolverCountWithExpired.Inc()
			// If the lock is committed or rolled back, resolve it.
			cleanRegions, exists := cleanTxns[l.TxnID]
			if !exists {
				cleanRegions = make(map[locate.RegionVerID]struct{})
				cleanTxns[l.TxnID] = cleanRegions
			}

			if status.primaryLock != nil && !forceSyncCommit && status.primaryLock.UseAsyncCommit && !exists {
				err = lr.resolveLockAsync(bo, l, status)
				if _, ok := errors.Cause(err).(*nonAsyncCommitLock); ok {
					err = resolve(l, true)
				}
			} else if l.LockType == kvrpcpb.Op_PessimisticLock {
				err = lr.resolvePessimisticLock(bo, l, cleanRegions)
			} else {
				err = lr.resolveLock(bo, l, status, lite, cleanRegions)
			}
			if err != nil {
				return err
			}
		} else {
			metrics.LockResolverCountWithNotExpired.Inc()
			// If the lock is valid, the txn may be a pessimistic transaction.
			// Update the txn expire time.
			msBeforeLockExpired := lr.store.GetOracle().UntilExpired(l.TxnID, status.ttl, &oracle.Option{TxnScope: oracle.GlobalTxnScope})
			msBeforeTxnExpired.update(msBeforeLockExpired)
			if forWrite {
				// Write conflict detected!
				// If it's an optimistic conflict and the current txn is earlier than the lock owner,
				// abort the current transaction.
				// This avoids the deadlock scenario of two large transactions.
				if l.LockType != kvrpcpb.Op_PessimisticLock && l.TxnID > callerStartTS {
					metrics.LockResolverCountWithWriteConflict.Inc()
					return tikverr.NewErrWriteConfictWithArgs(callerStartTS, l.TxnID, status.commitTS, l.Key)
				}
			} else {
				if status.action != kvrpcpb.Action_MinCommitTSPushed {
					pushFail = true
					return nil
				}
				pushed = append(pushed, l.TxnID)
			}
		}
		return nil
	}

	for _, l := range locks {
		err := resolve(l, false)
		if err != nil {
			msBeforeTxnExpired.update(0)
			err = errors.Trace(err)
			return msBeforeTxnExpired.value(), nil, err
		}
	}
	if pushFail {
		// If any of the locks fail to push minCommitTS, don't return the pushed array.
		pushed = nil
	}

	if msBeforeTxnExpired.value() > 0 && len(pushed) == 0 {
		// If len(pushed) > 0, the caller will not block on the locks; it pushes the minCommitTS instead.
		metrics.LockResolverCountWithWaitExpired.Inc()
	}
	return msBeforeTxnExpired.value(), pushed, nil
}

// ResolveLocksForWrite resolves locks for write operations.
func (lr *LockResolver) ResolveLocksForWrite(bo *retry.Backoffer, callerStartTS, callerForUpdateTS uint64, locks []*Lock) (int64, error) {
	// The forWrite parameter is only useful for optimistic transactions, which can avoid deadlocks between large transactions,
	// so only use forWrite if the callerForUpdateTS is zero.
	msBeforeTxnExpired, _, err := lr.resolveLocks(bo, callerStartTS, locks, callerForUpdateTS == 0, false)
	return msBeforeTxnExpired, err
}

type txnExpireTime struct {
	initialized bool
	txnExpire   int64
}

func (t *txnExpireTime) update(lockExpire int64) {
	if lockExpire <= 0 {
		lockExpire = 0
	}
	if !t.initialized {
		t.txnExpire = lockExpire
		t.initialized = true
		return
	}
	if lockExpire < t.txnExpire {
		t.txnExpire = lockExpire
	}
}

func (t *txnExpireTime) value() int64 {
	if !t.initialized {
		return 0
	}
	return t.txnExpire
}
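
// For illustration, txnExpireTime tracks the minimum of all reported expire
// times, clamping negative values to zero:
//
//	var t txnExpireTime
//	t.update(500)
//	t.update(200)
//	t.update(-10) // treated as 0
//	_ = t.value() // 0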

const getTxnStatusMaxBackoff = 20000

// GetTxnStatus queries tikv-server for a txn's status (commit/rollback).
// If the primary key is still locked, it will launch a Rollback to abort it.
// To avoid unnecessarily aborting too many txns, it is wiser to wait a few
// seconds before calling it after Prewrite.
func (lr *LockResolver) GetTxnStatus(txnID uint64, callerStartTS uint64, primary []byte) (TxnStatus, error) {
	var status TxnStatus
	bo := retry.NewBackoffer(context.Background(), getTxnStatusMaxBackoff)
	currentTS, err := lr.store.GetOracle().GetLowResolutionTimestamp(bo.GetCtx(), &oracle.Option{TxnScope: oracle.GlobalTxnScope})
	if err != nil {
		return status, err
	}
	return lr.getTxnStatus(bo, txnID, primary, callerStartTS, currentTS, true, false, nil)
}
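
// Usage sketch (illustrative): query a transaction's status by its start ts
// and primary key; txnID, callerStartTS, and primaryKey are assumptions here:
//
//	status, err := lr.GetTxnStatus(txnID, callerStartTS, primaryKey)
//	if err != nil {
//		return err
//	}
//	switch {
//	case status.IsCommitted():
//		// committed at status.CommitTS()
//	case status.TTL() > 0:
//		// the transaction is still alive
//	default:
//		// the transaction has been rolled back
//	}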

func (lr *LockResolver) getTxnStatusFromLock(bo *retry.Backoffer, l *Lock, callerStartTS uint64, forceSyncCommit bool) (TxnStatus, error) {
	var currentTS uint64
	var err error
	var status TxnStatus

	if l.TTL == 0 {
		// NOTE: l.TTL = 0 is a special protocol!!!
		// When a pessimistic txn's prewrite meets locks of a txn, it should resolve the lock **unconditionally**.
		// In this case, TiKV uses lock TTL = 0 to notify TiDB, and TiDB should resolve the lock!
		// Set currentTS to max uint64 to make the lock expired.
		currentTS = math.MaxUint64
	} else {
		currentTS, err = lr.store.GetOracle().GetLowResolutionTimestamp(bo.GetCtx(), &oracle.Option{TxnScope: oracle.GlobalTxnScope})
		if err != nil {
			return TxnStatus{}, err
		}
	}

	rollbackIfNotExist := false
	if _, err := util.EvalFailpoint("getTxnStatusDelay"); err == nil {
		time.Sleep(100 * time.Millisecond)
	}
	for {
		status, err = lr.getTxnStatus(bo, l.TxnID, l.Primary, callerStartTS, currentTS, rollbackIfNotExist, forceSyncCommit, l)
		if err == nil {
			return status, nil
		}
		// If the error is something other than txnNotFoundErr, return the error (network
		// unavailable, tikv down, backoff timeout etc.) to the caller.
		if _, ok := errors.Cause(err).(txnNotFoundErr); !ok {
			return TxnStatus{}, err
		}

		if _, err := util.EvalFailpoint("txnNotFoundRetTTL"); err == nil {
			return TxnStatus{ttl: l.TTL, action: kvrpcpb.Action_NoAction}, nil
		}

		// Handle the txnNotFound error.
		// getTxnStatus() returns it when the secondary locks exist while the primary lock doesn't.
		// This is likely to happen during concurrent prewrite when the secondary regions
		// succeed before the primary region.
		if err := bo.Backoff(retry.BoTxnNotFound, err); err != nil {
			logutil.Logger(bo.GetCtx()).Warn("getTxnStatusFromLock backoff fail", zap.Error(err))
		}

		if lr.store.GetOracle().UntilExpired(l.TxnID, l.TTL, &oracle.Option{TxnScope: oracle.GlobalTxnScope}) <= 0 {
			logutil.Logger(bo.GetCtx()).Warn("lock txn not found, lock has expired",
				zap.Uint64("CallerStartTs", callerStartTS),
				zap.Stringer("lock str", l))
			if l.LockType == kvrpcpb.Op_PessimisticLock {
				if _, err := util.EvalFailpoint("txnExpireRetTTL"); err == nil {
					return TxnStatus{action: kvrpcpb.Action_LockNotExistDoNothing},
						errors.New("error txn not found and lock expired")
				}
			}
			// For pessimistic lock resolving, if the primary lock does not exist and rollbackIfNotExist is true,
			// Action_LockNotExistDoNothing will be returned as the status.
			rollbackIfNotExist = true
		} else {
			if l.LockType == kvrpcpb.Op_PessimisticLock {
				return TxnStatus{ttl: l.TTL}, nil
			}
		}
	}
}

type txnNotFoundErr struct {
	*kvrpcpb.TxnNotFound
}

func (e txnNotFoundErr) Error() string {
	return e.TxnNotFound.String()
}

// getTxnStatus sends the CheckTxnStatus request to the TiKV server.
// When rollbackIfNotExist is false, the caller should be careful with the txnNotFoundErr error.
func (lr *LockResolver) getTxnStatus(bo *retry.Backoffer, txnID uint64, primary []byte,
	callerStartTS, currentTS uint64, rollbackIfNotExist bool, forceSyncCommit bool, lockInfo *Lock) (TxnStatus, error) {
	if s, ok := lr.getResolved(txnID); ok {
		return s, nil
	}

	metrics.LockResolverCountWithQueryTxnStatus.Inc()

	// CheckTxnStatus may meet the following cases:
	// 1. LOCK
	// 1.1 Lock expired -- orphan lock, failure to update TTL, crash recovery, etc.
	// 1.2 Lock TTL -- active transaction holding the lock.
	// 2. NO LOCK
	// 2.1 Txn committed.
	// 2.2 Txn rolled back -- rolled back by itself, rolled back by others, GC tomb, etc.
	// 2.3 No lock -- pessimistic lock rollback, concurrent prewrite.

	var status TxnStatus
	resolvingPessimisticLock := lockInfo != nil && lockInfo.LockType == kvrpcpb.Op_PessimisticLock
	req := tikvrpc.NewRequest(tikvrpc.CmdCheckTxnStatus, &kvrpcpb.CheckTxnStatusRequest{
		PrimaryKey:               primary,
		LockTs:                   txnID,
		CallerStartTs:            callerStartTS,
		CurrentTs:                currentTS,
		RollbackIfNotExist:       rollbackIfNotExist,
		ForceSyncCommit:          forceSyncCommit,
		ResolvingPessimisticLock: resolvingPessimisticLock,
	})
	for {
		loc, err := lr.store.GetRegionCache().LocateKey(bo, primary)
		if err != nil {
			return status, errors.Trace(err)
		}
		req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
		resp, err := lr.store.SendReq(bo, req, loc.Region, client.ReadTimeoutShort)
		if err != nil {
			return status, errors.Trace(err)
		}
		regionErr, err := resp.GetRegionError()
		if err != nil {
			return status, errors.Trace(err)
		}
		if regionErr != nil {
			err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
			if err != nil {
				return status, errors.Trace(err)
			}
			continue
		}
		if resp.Resp == nil {
			return status, errors.Trace(tikverr.ErrBodyMissing)
		}
		cmdResp := resp.Resp.(*kvrpcpb.CheckTxnStatusResponse)
		if keyErr := cmdResp.GetError(); keyErr != nil {
			txnNotFound := keyErr.GetTxnNotFound()
			if txnNotFound != nil {
				return status, txnNotFoundErr{txnNotFound}
			}

			err = errors.Errorf("unexpected err: %s, tid: %v", keyErr, txnID)
			logutil.BgLogger().Error("getTxnStatus error", zap.Error(err))
			return status, err
		}
		status.action = cmdResp.Action
		status.primaryLock = cmdResp.LockInfo

		if status.primaryLock != nil && status.primaryLock.UseAsyncCommit && !forceSyncCommit {
			if !lr.store.GetOracle().IsExpired(txnID, cmdResp.LockTtl, &oracle.Option{TxnScope: oracle.GlobalTxnScope}) {
				status.ttl = cmdResp.LockTtl
			}
		} else if cmdResp.LockTtl != 0 {
			status.ttl = cmdResp.LockTtl
		} else {
			if cmdResp.CommitVersion == 0 {
				metrics.LockResolverCountWithQueryTxnStatusRolledBack.Inc()
			} else {
				metrics.LockResolverCountWithQueryTxnStatusCommitted.Inc()
			}

			status.commitTS = cmdResp.CommitVersion
			if status.StatusCacheable() {
				lr.saveResolved(txnID, status)
			}
		}

		return status, nil
	}
}

// asyncResolveData is data contributed by multiple goroutines when resolving locks using the async commit protocol. All
// data should be protected by the mutex field.
type asyncResolveData struct {
	mutex sync.Mutex
	// If any key has been committed (missingLock is true), then this is the commit ts. In that case, all locks should
	// be committed with the same commit timestamp. If no locks have been committed (missingLock is false), then we will
	// use max(all min commit ts) from all locks; i.e., it is the commit ts we should use. Note that a secondary lock's
	// commit ts may or may not be the same as the primary lock's min commit ts.
	commitTs    uint64
	keys        [][]byte
	missingLock bool
}

type nonAsyncCommitLock struct{}

func (*nonAsyncCommitLock) Error() string {
	return "CheckSecondaryLocks receives a non-async-commit lock"
}

// addKeys adds the keys from locks to data, keeping other fields up to date. startTS and commitTS are for the
// transaction being resolved.
//
// In the async commit protocol when checking locks, we send a list of keys to check and get back a list of locks. There
// will be a lock for every key which is locked. If there are fewer locks than keys, then a lock is missing because it
// has been committed, rolled back, or was never locked.
//
// In this function, locks is the list of locks, and expected is the number of keys. asyncResolveData.missingLock will be
// set to true if the lengths don't match. If the lengths do match, then the lock keys are added to asyncResolveData.keys
// and will need to be resolved by the caller.
func (data *asyncResolveData) addKeys(locks []*kvrpcpb.LockInfo, expected int, startTS uint64, commitTS uint64) error {
	data.mutex.Lock()
	defer data.mutex.Unlock()

	// Check locks to see if any have been committed or rolled back.
	if len(locks) < expected {
		logutil.BgLogger().Debug("addKeys: lock has been committed or rolled back", zap.Uint64("commit ts", commitTS), zap.Uint64("start ts", startTS))
		// A lock is missing - the transaction must either have been rolled back or committed.
		if !data.missingLock {
			// commitTS == 0 => lock has been rolled back.
			if commitTS != 0 && commitTS < data.commitTs {
				return errors.Errorf("commit TS must be greater or equal to min commit TS: commit ts: %v, min commit ts: %v", commitTS, data.commitTs)
			}
			data.commitTs = commitTS
		}
		data.missingLock = true

		if data.commitTs != commitTS {
			return errors.Errorf("commit TS mismatch in async commit recovery: %v and %v", data.commitTs, commitTS)
		}

		// We do not need to resolve the remaining locks because TiKV will have resolved them as appropriate.
		return nil
	}

	logutil.BgLogger().Debug("addKeys: all locks present", zap.Uint64("start ts", startTS))
	// Save all locks to be resolved.
	for _, lockInfo := range locks {
		if lockInfo.LockVersion != startTS {
			err := errors.Errorf("unexpected timestamp, expected: %v, found: %v", startTS, lockInfo.LockVersion)
			logutil.BgLogger().Error("addLocks error", zap.Error(err))
			return err
		}
		if !lockInfo.UseAsyncCommit {
			return &nonAsyncCommitLock{}
		}
		if !data.missingLock && lockInfo.MinCommitTs > data.commitTs {
			data.commitTs = lockInfo.MinCommitTs
		}
		data.keys = append(data.keys, lockInfo.Key)
	}

	return nil
}
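
// A worked example of the commit-ts rule above (illustrative values): suppose
// the primary's min commit ts is 100 and the secondaries report locks with
// MinCommitTs 100 and 120, with no key missing. The transaction can then be
// committed at max(100, 100, 120) = 120. If instead one key had no lock and
// TiKV reported CommitTs 0, the transaction was rolled back: missingLock is
// set and commitTs becomes 0.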

func (lr *LockResolver) checkSecondaries(bo *retry.Backoffer, txnID uint64, curKeys [][]byte, curRegionID locate.RegionVerID, shared *asyncResolveData) error {
	checkReq := &kvrpcpb.CheckSecondaryLocksRequest{
		Keys:         curKeys,
		StartVersion: txnID,
	}
	req := tikvrpc.NewRequest(tikvrpc.CmdCheckSecondaryLocks, checkReq)
	metrics.LockResolverCountWithQueryCheckSecondaryLocks.Inc()
	req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
	resp, err := lr.store.SendReq(bo, req, curRegionID, client.ReadTimeoutShort)
	if err != nil {
		return errors.Trace(err)
	}
	regionErr, err := resp.GetRegionError()
	if err != nil {
		return errors.Trace(err)
	}
	if regionErr != nil {
		err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
		if err != nil {
			return errors.Trace(err)
		}

		logutil.BgLogger().Debug("checkSecondaries: region error, regrouping", zap.Uint64("txn id", txnID), zap.Uint64("region", curRegionID.GetID()))

		// If regions have changed, then we might need to regroup the keys. Since this should be rare and for the sake
		// of simplicity, we will resolve regions sequentially.
		regions, _, err := lr.store.GetRegionCache().GroupKeysByRegion(bo, curKeys, nil)
		if err != nil {
			return errors.Trace(err)
		}
		for regionID, keys := range regions {
			// Recursion will terminate because the resolve request succeeds or the Backoffer reaches its limit.
			if err = lr.checkSecondaries(bo, txnID, keys, regionID, shared); err != nil {
				return err
			}
		}
		return nil
	}
	if resp.Resp == nil {
		return errors.Trace(tikverr.ErrBodyMissing)
	}

	checkResp := resp.Resp.(*kvrpcpb.CheckSecondaryLocksResponse)
	return shared.addKeys(checkResp.Locks, len(curKeys), txnID, checkResp.CommitTs)
}

// resolveLockAsync resolves l assuming it was locked using the async commit protocol.
func (lr *LockResolver) resolveLockAsync(bo *retry.Backoffer, l *Lock, status TxnStatus) error {
	metrics.LockResolverCountWithResolveAsync.Inc()

	resolveData, err := lr.checkAllSecondaries(bo, l, &status)
	if err != nil {
		return err
	}

	status.commitTS = resolveData.commitTs

	resolveData.keys = append(resolveData.keys, l.Primary)
	keysByRegion, _, err := lr.store.GetRegionCache().GroupKeysByRegion(bo, resolveData.keys, nil)
	if err != nil {
		return errors.Trace(err)
	}

	logutil.BgLogger().Info("resolve async commit", zap.Uint64("startTS", l.TxnID), zap.Uint64("commitTS", status.commitTS))

	errChan := make(chan error, len(keysByRegion))
	// Resolve every lock in the transaction.
	for region, locks := range keysByRegion {
		curLocks := locks
		curRegion := region
		resolveBo, cancel := bo.Fork()
		defer cancel()

		go func() {
			errChan <- lr.resolveRegionLocks(resolveBo, l, curRegion, curLocks, status)
		}()
	}

	var errs []string
	for range keysByRegion {
		err1 := <-errChan
		if err1 != nil {
			errs = append(errs, err1.Error())
		}
	}

	if len(errs) > 0 {
		return errors.Errorf("async commit recovery (sending ResolveLock) finished with errors: %v", errs)
	}

	return nil
}

// checkAllSecondaries checks the secondary locks of an async commit transaction to find out the final
// status of the transaction.
func (lr *LockResolver) checkAllSecondaries(bo *retry.Backoffer, l *Lock, status *TxnStatus) (*asyncResolveData, error) {
	regions, _, err := lr.store.GetRegionCache().GroupKeysByRegion(bo, status.primaryLock.Secondaries, nil)
	if err != nil {
		return nil, errors.Trace(err)
	}

	shared := asyncResolveData{
		mutex:       sync.Mutex{},
		commitTs:    status.primaryLock.MinCommitTs,
		keys:        [][]byte{},
		missingLock: false,
	}

	errChan := make(chan error, len(regions))
	for regionID, keys := range regions {
		curRegionID := regionID
		curKeys := keys
		checkBo, cancel := bo.Fork()
		defer cancel()

		go func() {
			errChan <- lr.checkSecondaries(checkBo, l.TxnID, curKeys, curRegionID, &shared)
		}()
	}

	for range regions {
		err := <-errChan
		if err != nil {
			return nil, err
		}
	}

	return &shared, nil
}

// resolveRegionLocks is essentially the same as resolveLock, but we resolve all keys in the same region at the same time.
func (lr *LockResolver) resolveRegionLocks(bo *retry.Backoffer, l *Lock, region locate.RegionVerID, keys [][]byte, status TxnStatus) error {
	lreq := &kvrpcpb.ResolveLockRequest{
		StartVersion: l.TxnID,
	}
	if status.IsCommitted() {
		lreq.CommitVersion = status.CommitTS()
	}
	lreq.Keys = keys
	req := tikvrpc.NewRequest(tikvrpc.CmdResolveLock, lreq)
	req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
	resp, err := lr.store.SendReq(bo, req, region, client.ReadTimeoutShort)
	if err != nil {
		return errors.Trace(err)
	}

	regionErr, err := resp.GetRegionError()
	if err != nil {
		return errors.Trace(err)
	}
	if regionErr != nil {
		err := bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
		if err != nil {
			return errors.Trace(err)
		}

		logutil.BgLogger().Info("resolveRegionLocks region error, regrouping", zap.String("lock", l.String()), zap.Uint64("region", region.GetID()))

		// Regroup locks.
		regions, _, err := lr.store.GetRegionCache().GroupKeysByRegion(bo, keys, nil)
		if err != nil {
			return errors.Trace(err)
		}
		for regionID, keys := range regions {
			// Recursion will terminate because the resolve request succeeds or the Backoffer reaches its limit.
			if err = lr.resolveRegionLocks(bo, l, regionID, keys, status); err != nil {
				return err
			}
		}
		return nil
	}
	if resp.Resp == nil {
		return errors.Trace(tikverr.ErrBodyMissing)
	}
	cmdResp := resp.Resp.(*kvrpcpb.ResolveLockResponse)
	if keyErr := cmdResp.GetError(); keyErr != nil {
		err = errors.Errorf("unexpected resolve err: %s, lock: %v", keyErr, l)
		logutil.BgLogger().Error("resolveLock error", zap.Error(err))
	}

	return nil
}

func (lr *LockResolver) resolveLock(bo *retry.Backoffer, l *Lock, status TxnStatus, lite bool, cleanRegions map[locate.RegionVerID]struct{}) error {
	metrics.LockResolverCountWithResolveLocks.Inc()
	resolveLite := lite || l.TxnSize < lr.resolveLockLiteThreshold
	for {
		loc, err := lr.store.GetRegionCache().LocateKey(bo, l.Key)
		if err != nil {
			return errors.Trace(err)
		}
		if _, ok := cleanRegions[loc.Region]; ok {
			return nil
		}
		lreq := &kvrpcpb.ResolveLockRequest{
			StartVersion: l.TxnID,
		}
		if status.IsCommitted() {
			lreq.CommitVersion = status.CommitTS()
		} else {
			logutil.BgLogger().Info("resolveLock rollback", zap.String("lock", l.String()))
		}

		if resolveLite {
			// Only resolve the specified keys when it is a small transaction,
			// to avoid scanning the whole region in this case.
			metrics.LockResolverCountWithResolveLockLite.Inc()
			lreq.Keys = [][]byte{l.Key}
		}
		req := tikvrpc.NewRequest(tikvrpc.CmdResolveLock, lreq)
		req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
		resp, err := lr.store.SendReq(bo, req, loc.Region, client.ReadTimeoutShort)
		if err != nil {
			return errors.Trace(err)
		}
		regionErr, err := resp.GetRegionError()
		if err != nil {
			return errors.Trace(err)
		}
		if regionErr != nil {
			err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
			if err != nil {
				return errors.Trace(err)
			}
			continue
		}
		if resp.Resp == nil {
			return errors.Trace(tikverr.ErrBodyMissing)
		}
		cmdResp := resp.Resp.(*kvrpcpb.ResolveLockResponse)
		if keyErr := cmdResp.GetError(); keyErr != nil {
			err = errors.Errorf("unexpected resolve err: %s, lock: %v", keyErr, l)
			logutil.BgLogger().Error("resolveLock error", zap.Error(err))
			return err
		}
		if !resolveLite {
			cleanRegions[loc.Region] = struct{}{}
		}
		return nil
	}
}

func (lr *LockResolver) resolvePessimisticLock(bo *retry.Backoffer, l *Lock, cleanRegions map[locate.RegionVerID]struct{}) error {
	metrics.LockResolverCountWithResolveLocks.Inc()
	for {
		loc, err := lr.store.GetRegionCache().LocateKey(bo, l.Key)
		if err != nil {
			return errors.Trace(err)
		}
		if _, ok := cleanRegions[loc.Region]; ok {
			return nil
		}
		forUpdateTS := l.LockForUpdateTS
		if forUpdateTS == 0 {
			forUpdateTS = math.MaxUint64
		}
		pessimisticRollbackReq := &kvrpcpb.PessimisticRollbackRequest{
			StartVersion: l.TxnID,
			ForUpdateTs:  forUpdateTS,
			Keys:         [][]byte{l.Key},
		}
		req := tikvrpc.NewRequest(tikvrpc.CmdPessimisticRollback, pessimisticRollbackReq)
		req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
		resp, err := lr.store.SendReq(bo, req, loc.Region, client.ReadTimeoutShort)
		if err != nil {
			return errors.Trace(err)
		}
		regionErr, err := resp.GetRegionError()
		if err != nil {
			return errors.Trace(err)
		}
		if regionErr != nil {
			err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
			if err != nil {
				return errors.Trace(err)
			}
			continue
		}
		if resp.Resp == nil {
			return errors.Trace(tikverr.ErrBodyMissing)
		}
		cmdResp := resp.Resp.(*kvrpcpb.PessimisticRollbackResponse)
		if keyErr := cmdResp.GetErrors(); len(keyErr) > 0 {
			err = errors.Errorf("unexpected resolve pessimistic lock err: %s, lock: %v", keyErr[0], l)
			logutil.Logger(bo.GetCtx()).Error("resolveLock error", zap.Error(err))
			return err
		}
		return nil
	}
}