github.com/KinWaiYuen/client-go/v2@v2.5.4/txnkv/transaction/2pc.go (about)

     1  // Copyright 2021 TiKV Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // NOTE: The code in this file is based on code from the
    16  // TiDB project, licensed under the Apache License v 2.0
    17  //
    18  // https://github.com/pingcap/tidb/tree/cc5e161ac06827589c4966674597c137cc9e809c/store/tikv/2pc.go
    19  //
    20  
    21  // Copyright 2016 PingCAP, Inc.
    22  //
    23  // Licensed under the Apache License, Version 2.0 (the "License");
    24  // you may not use this file except in compliance with the License.
    25  // You may obtain a copy of the License at
    26  //
    27  //     http://www.apache.org/licenses/LICENSE-2.0
    28  //
    29  // Unless required by applicable law or agreed to in writing, software
    30  // distributed under the License is distributed on an "AS IS" BASIS,
    31  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    32  // See the License for the specific language governing permissions and
    33  // limitations under the License.
    34  
    35  package transaction
    36  
    37  import (
    38  	"bytes"
    39  	"context"
    40  	"encoding/hex"
    41  	"math"
    42  	"math/rand"
    43  	"strings"
    44  	"sync"
    45  	"sync/atomic"
    46  	"time"
    47  	"unsafe"
    48  
    49  	"github.com/KinWaiYuen/client-go/v2/config"
    50  	tikverr "github.com/KinWaiYuen/client-go/v2/error"
    51  	"github.com/KinWaiYuen/client-go/v2/internal/client"
    52  	"github.com/KinWaiYuen/client-go/v2/internal/latch"
    53  	"github.com/KinWaiYuen/client-go/v2/internal/locate"
    54  	"github.com/KinWaiYuen/client-go/v2/internal/logutil"
    55  	"github.com/KinWaiYuen/client-go/v2/internal/retry"
    56  	"github.com/KinWaiYuen/client-go/v2/internal/unionstore"
    57  	"github.com/KinWaiYuen/client-go/v2/kv"
    58  	"github.com/KinWaiYuen/client-go/v2/metrics"
    59  	"github.com/KinWaiYuen/client-go/v2/oracle"
    60  	"github.com/KinWaiYuen/client-go/v2/tikvrpc"
    61  	"github.com/KinWaiYuen/client-go/v2/txnkv/txnlock"
    62  	"github.com/KinWaiYuen/client-go/v2/util"
    63  	"github.com/pingcap/errors"
    64  	"github.com/pingcap/kvproto/pkg/kvrpcpb"
    65  	"github.com/pingcap/parser/terror"
    66  	"github.com/prometheus/client_golang/prometheus"
    67  	zap "go.uber.org/zap"
    68  )
    69  
// slowRequestThreshold is the duration a single request may take before it is
// treated as slow; a warning is logged for requests that exceed it.
const slowRequestThreshold = time.Minute
    72  
// twoPhaseCommitAction is one step of the commit protocol that can be applied
// to batches of mutations. This file type-switches on implementations such as
// actionPrewrite, actionCommit, actionCleanup, actionPessimisticLock and
// actionPessimisticRollback.
type twoPhaseCommitAction interface {
	// handleSingleBatch applies the action to a single batch of mutations.
	// doActionOnBatches calls it directly when batches are processed serially.
	handleSingleBatch(*twoPhaseCommitter, *retry.Backoffer, batchMutations) error
	// tiKVTxnRegionsNumHistogram returns the observer that
	// doActionOnGroupMutations feeds with the number of region groups.
	tiKVTxnRegionsNumHistogram() prometheus.Observer
	// String names the action; it is used as the "action type" log field.
	String() string
}
    78  
// Global variable set by config file.
var (
	// ManagedLockTTL is the upper bound, in milliseconds, that txnLockTTL applies
	// to the size-based part of a lock's TTL.
	ManagedLockTTL uint64 = 20000 // 20s
)
    83  
var (
	// PrewriteMaxBackoff is the max sleep time of the `pre-write` command.
	PrewriteMaxBackoff = 40000
	// CommitMaxBackoff is the max sleep time of the 'commit' command.
	// NOTE(review): unlike PrewriteMaxBackoff (which defaults to int) this one is
	// declared as uint64 — confirm callers convert it where an int is expected.
	CommitMaxBackoff = uint64(40000)
)
    90  
// kvstore is the set of store capabilities the two-phase committer relies on;
// twoPhaseCommitter.store holds a value of this type.
type kvstore interface {
	// GetRegionCache gets the RegionCache.
	GetRegionCache() *locate.RegionCache
	// SplitRegions splits regions by splitKeys.
	SplitRegions(ctx context.Context, splitKeys [][]byte, scatter bool, tableID *int64) (regionIDs []uint64, err error)
	// WaitScatterRegionFinish implements SplittableStore interface.
	// backOff is the back off time of the wait scatter region.(Milliseconds)
	// if backOff <= 0, the default wait scatter back off time will be used.
	WaitScatterRegionFinish(ctx context.Context, regionID uint64, backOff int) error

	// GetTimestampWithRetry returns latest timestamp.
	GetTimestampWithRetry(bo *retry.Backoffer, scope string) (uint64, error)
	// GetOracle gets a timestamp oracle client.
	GetOracle() oracle.Oracle
	// CurrentTimestamp returns a timestamp for the given transaction scope.
	// NOTE(review): presumably the current TSO timestamp — confirm against the implementation.
	CurrentTimestamp(txnScope string) (uint64, error)
	// SendReq sends a request to TiKV.
	SendReq(bo *retry.Backoffer, req *tikvrpc.Request, regionID locate.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error)
	// GetTiKVClient gets the client instance.
	GetTiKVClient() (client client.Client)
	// GetLockResolver gets the lock resolver of the store.
	GetLockResolver() *txnlock.LockResolver
	// Ctx returns the store's context; the committer uses it for the backoffer
	// of the background secondary-commit goroutine.
	Ctx() context.Context
	// WaitGroup returns the store's wait group; the committer registers its
	// background secondary-commit goroutine on it.
	WaitGroup() *sync.WaitGroup
	// TxnLatches returns txnLatches.
	TxnLatches() *latch.LatchesScheduler
	// GetClusterID returns the ID of the cluster the store is connected to.
	GetClusterID() uint64
}
   117  
// twoPhaseCommitter executes a two-phase commit protocol.
//
// It is created by newTwoPhaseCommitter for a single KVTxn; the mutation set,
// primary key, lock TTL and commit details are filled in later by
// initKeysAndMutations.
type twoPhaseCommitter struct {
	store               kvstore
	txn                 *KVTxn
	startTS             uint64
	mutations           *memBufferMutations
	lockTTL             uint64
	commitTS            uint64
	priority            kvrpcpb.CommandPri
	sessionID           uint64 // sessionID is used for log.
	cleanWg             sync.WaitGroup
	detail              unsafe.Pointer
	txnSize             int
	hasNoNeedCommitKeys bool

	// primaryKey is the first collected key whose op is not Op_CheckNotExists;
	// when it is empty, primary() falls back to the first mutation's key.
	primaryKey  []byte
	forUpdateTS uint64

	mu struct {
		sync.RWMutex
		undeterminedErr error // undeterminedErr saves the rpc error we encounter when commit primary key.
		committed       bool
	}
	syncLog bool
	// For pessimistic transaction
	isPessimistic bool
	isFirstLock   bool
	// regionTxnSize stores the number of keys involved in each region
	regionTxnSize map[uint64]int
	// Used by pessimistic transaction and large transaction.
	ttlManager

	// testingKnobs holds hooks that are only meant to be set by tests.
	testingKnobs struct {
		acAfterCommitPrimary chan struct{}
		bkAfterCommitPrimary chan struct{}
		noFallBack           bool
	}

	useAsyncCommit    uint32
	minCommitTS       uint64
	maxCommitTS       uint64
	prewriteStarted   bool
	prewriteCancelled uint32
	useOnePC          uint32
	onePCCommitTS     uint64

	hasTriedAsyncCommit bool
	hasTriedOnePC       bool

	// doingAmend means the amend prewrite is ongoing.
	doingAmend bool

	binlog BinlogExecutor

	resourceGroupTag []byte

	// allowed when tikv disk full happened.
	diskFullOpt kvrpcpb.DiskFullOpt
}
   177  
   178  type memBufferMutations struct {
   179  	storage *unionstore.MemDB
   180  	handles []unionstore.MemKeyHandle
   181  }
   182  
   183  func newMemBufferMutations(sizeHint int, storage *unionstore.MemDB) *memBufferMutations {
   184  	return &memBufferMutations{
   185  		handles: make([]unionstore.MemKeyHandle, 0, sizeHint),
   186  		storage: storage,
   187  	}
   188  }
   189  
   190  func (m *memBufferMutations) Len() int {
   191  	return len(m.handles)
   192  }
   193  
   194  func (m *memBufferMutations) GetKey(i int) []byte {
   195  	return m.storage.GetKeyByHandle(m.handles[i])
   196  }
   197  
   198  func (m *memBufferMutations) GetKeys() [][]byte {
   199  	ret := make([][]byte, m.Len())
   200  	for i := range ret {
   201  		ret[i] = m.GetKey(i)
   202  	}
   203  	return ret
   204  }
   205  
   206  func (m *memBufferMutations) GetValue(i int) []byte {
   207  	v, _ := m.storage.GetValueByHandle(m.handles[i])
   208  	return v
   209  }
   210  
   211  func (m *memBufferMutations) GetOp(i int) kvrpcpb.Op {
   212  	return kvrpcpb.Op(m.handles[i].UserData >> 1)
   213  }
   214  
   215  func (m *memBufferMutations) IsPessimisticLock(i int) bool {
   216  	return m.handles[i].UserData&1 != 0
   217  }
   218  
   219  func (m *memBufferMutations) Slice(from, to int) CommitterMutations {
   220  	return &memBufferMutations{
   221  		handles: m.handles[from:to],
   222  		storage: m.storage,
   223  	}
   224  }
   225  
   226  func (m *memBufferMutations) Push(op kvrpcpb.Op, isPessimisticLock bool, handle unionstore.MemKeyHandle) {
   227  	aux := uint16(op) << 1
   228  	if isPessimisticLock {
   229  		aux |= 1
   230  	}
   231  	handle.UserData = aux
   232  	m.handles = append(m.handles, handle)
   233  }
   234  
// CommitterMutations contains the mutations to be submitted.
// Both memBufferMutations and PlainMutations implement it.
type CommitterMutations interface {
	// Len returns the number of mutations.
	Len() int
	// GetKey returns the key of the i-th mutation.
	GetKey(i int) []byte
	// GetKeys returns the keys of all mutations.
	GetKeys() [][]byte
	// GetOp returns the operation of the i-th mutation.
	GetOp(i int) kvrpcpb.Op
	// GetValue returns the value of the i-th mutation.
	GetValue(i int) []byte
	// IsPessimisticLock returns the pessimistic flag of the i-th mutation.
	IsPessimisticLock(i int) bool
	// Slice returns the sub mutations in range [from, to).
	Slice(from, to int) CommitterMutations
}
   245  
   246  // PlainMutations contains transaction operations.
   247  type PlainMutations struct {
   248  	ops               []kvrpcpb.Op
   249  	keys              [][]byte
   250  	values            [][]byte
   251  	isPessimisticLock []bool
   252  }
   253  
   254  // NewPlainMutations creates a PlainMutations object with sizeHint reserved.
   255  func NewPlainMutations(sizeHint int) PlainMutations {
   256  	return PlainMutations{
   257  		ops:               make([]kvrpcpb.Op, 0, sizeHint),
   258  		keys:              make([][]byte, 0, sizeHint),
   259  		values:            make([][]byte, 0, sizeHint),
   260  		isPessimisticLock: make([]bool, 0, sizeHint),
   261  	}
   262  }
   263  
   264  // Slice return a sub mutations in range [from, to).
   265  func (c *PlainMutations) Slice(from, to int) CommitterMutations {
   266  	var res PlainMutations
   267  	res.keys = c.keys[from:to]
   268  	if c.ops != nil {
   269  		res.ops = c.ops[from:to]
   270  	}
   271  	if c.values != nil {
   272  		res.values = c.values[from:to]
   273  	}
   274  	if c.isPessimisticLock != nil {
   275  		res.isPessimisticLock = c.isPessimisticLock[from:to]
   276  	}
   277  	return &res
   278  }
   279  
   280  // Push another mutation into mutations.
   281  func (c *PlainMutations) Push(op kvrpcpb.Op, key []byte, value []byte, isPessimisticLock bool) {
   282  	c.ops = append(c.ops, op)
   283  	c.keys = append(c.keys, key)
   284  	c.values = append(c.values, value)
   285  	c.isPessimisticLock = append(c.isPessimisticLock, isPessimisticLock)
   286  }
   287  
   288  // Len returns the count of mutations.
   289  func (c *PlainMutations) Len() int {
   290  	return len(c.keys)
   291  }
   292  
   293  // GetKey returns the key at index.
   294  func (c *PlainMutations) GetKey(i int) []byte {
   295  	return c.keys[i]
   296  }
   297  
   298  // GetKeys returns the keys.
   299  func (c *PlainMutations) GetKeys() [][]byte {
   300  	return c.keys
   301  }
   302  
   303  // GetOps returns the key ops.
   304  func (c *PlainMutations) GetOps() []kvrpcpb.Op {
   305  	return c.ops
   306  }
   307  
   308  // GetValues returns the key values.
   309  func (c *PlainMutations) GetValues() [][]byte {
   310  	return c.values
   311  }
   312  
   313  // GetPessimisticFlags returns the key pessimistic flags.
   314  func (c *PlainMutations) GetPessimisticFlags() []bool {
   315  	return c.isPessimisticLock
   316  }
   317  
   318  // GetOp returns the key op at index.
   319  func (c *PlainMutations) GetOp(i int) kvrpcpb.Op {
   320  	return c.ops[i]
   321  }
   322  
   323  // GetValue returns the key value at index.
   324  func (c *PlainMutations) GetValue(i int) []byte {
   325  	if len(c.values) <= i {
   326  		return nil
   327  	}
   328  	return c.values[i]
   329  }
   330  
   331  // IsPessimisticLock returns the key pessimistic flag at index.
   332  func (c *PlainMutations) IsPessimisticLock(i int) bool {
   333  	return c.isPessimisticLock[i]
   334  }
   335  
   336  // PlainMutation represents a single transaction operation.
   337  type PlainMutation struct {
   338  	KeyOp             kvrpcpb.Op
   339  	Key               []byte
   340  	Value             []byte
   341  	IsPessimisticLock bool
   342  }
   343  
   344  // MergeMutations append input mutations into current mutations.
   345  func (c *PlainMutations) MergeMutations(mutations PlainMutations) {
   346  	c.ops = append(c.ops, mutations.ops...)
   347  	c.keys = append(c.keys, mutations.keys...)
   348  	c.values = append(c.values, mutations.values...)
   349  	c.isPessimisticLock = append(c.isPessimisticLock, mutations.isPessimisticLock...)
   350  }
   351  
   352  // AppendMutation merges a single Mutation into the current mutations.
   353  func (c *PlainMutations) AppendMutation(mutation PlainMutation) {
   354  	c.ops = append(c.ops, mutation.KeyOp)
   355  	c.keys = append(c.keys, mutation.Key)
   356  	c.values = append(c.values, mutation.Value)
   357  	c.isPessimisticLock = append(c.isPessimisticLock, mutation.IsPessimisticLock)
   358  }
   359  
   360  // newTwoPhaseCommitter creates a twoPhaseCommitter.
   361  func newTwoPhaseCommitter(txn *KVTxn, sessionID uint64) (*twoPhaseCommitter, error) {
   362  	return &twoPhaseCommitter{
   363  		store:         txn.store,
   364  		txn:           txn,
   365  		startTS:       txn.StartTS(),
   366  		sessionID:     sessionID,
   367  		regionTxnSize: map[uint64]int{},
   368  		isPessimistic: txn.IsPessimistic(),
   369  		binlog:        txn.binlog,
   370  		diskFullOpt:   kvrpcpb.DiskFullOpt_NotAllowedOnFull,
   371  	}, nil
   372  }
   373  
   374  func (c *twoPhaseCommitter) extractKeyExistsErr(err *tikverr.ErrKeyExist) error {
   375  	if !c.txn.us.HasPresumeKeyNotExists(err.GetKey()) {
   376  		return errors.Errorf("session %d, existErr for key:%s should not be nil", c.sessionID, err.GetKey())
   377  	}
   378  	return errors.Trace(err)
   379  }
   380  
// KVFilter is a filter that filters out unnecessary KV pairs.
type KVFilter interface {
	// IsUnnecessaryKeyValue returns true when this KV pair does NOT need to be
	// committed. The committer then skips the pair, or downgrades it to a lock
	// if the key had been locked before.
	IsUnnecessaryKeyValue(key, value []byte, flags kv.KeyFlags) bool
}
   386  
// initKeysAndMutations walks the transaction's memory buffer and builds
// c.mutations, choosing an operation for every entry:
//   - entries without a value are kept as Op_Lock only if they carry the
//     locked flag, otherwise they are skipped;
//   - non-empty values become Op_Put (Op_Insert when the key is presumed not to
//     exist), or Op_Lock when the filter deems the pair unnecessary but the key
//     was locked before;
//   - empty values become Op_Del, or Op_CheckNotExists for optimistic
//     transactions whose key is presumed not to exist.
//
// It also selects the primary key, computes the lock TTL, copies priority,
// sync-log and resource-group settings from the transaction, and records the
// commit details. It returns nil without further setup when there is nothing
// to commit, and an error only when the start timestamp is math.MaxUint64.
func (c *twoPhaseCommitter) initKeysAndMutations() error {
	var size, putCnt, delCnt, lockCnt, checkCnt int

	txn := c.txn
	memBuf := txn.GetMemBuffer()
	sizeHint := txn.us.GetMemBuffer().Len()
	c.mutations = newMemBufferMutations(sizeHint, memBuf)
	c.isPessimistic = txn.IsPessimistic()
	filter := txn.kvFilter

	var err error
	for it := memBuf.IterWithFlags(nil, nil); it.Valid(); err = it.Next() {
		// The error returned by it.Next() is deliberately discarded here.
		// NOTE(review): presumably the in-memory iterator never fails — confirm.
		_ = err
		key := it.Key()
		flags := it.Flags()
		var value []byte
		var op kvrpcpb.Op

		if !it.HasValue() {
			// No value: only keys that were explicitly locked need a mutation.
			if !flags.HasLocked() {
				continue
			}
			op = kvrpcpb.Op_Lock
			lockCnt++
		} else {
			value = it.Value()
			isUnnecessaryKV := filter != nil && filter.IsUnnecessaryKeyValue(key, value, flags)
			if len(value) > 0 {
				if isUnnecessaryKV {
					if !flags.HasLocked() {
						continue
					}
					// If the key was locked before, we should prewrite the lock even if
					// the KV needn't be committed according to the filter. Otherwise, we
					// would forget to remove the pessimistic locks added before.
					op = kvrpcpb.Op_Lock
					lockCnt++
				} else {
					op = kvrpcpb.Op_Put
					if flags.HasPresumeKeyNotExists() {
						op = kvrpcpb.Op_Insert
					}
					putCnt++
				}
			} else {
				// An empty value represents a deletion.
				if isUnnecessaryKV {
					continue
				}
				if !txn.IsPessimistic() && flags.HasPresumeKeyNotExists() {
					// delete-your-writes keys in optimistic txn need check not exists in prewrite-phase
					// due to `Op_CheckNotExists` doesn't prewrite lock, so mark those keys should not be used in commit-phase.
					op = kvrpcpb.Op_CheckNotExists
					checkCnt++
					memBuf.UpdateFlags(key, kv.SetPrewriteOnly)
				} else {
					// normal delete keys in optimistic txn can be delete without not exists checking
					// delete-your-writes keys in pessimistic txn can ensure must be no exists so can directly delete them
					op = kvrpcpb.Op_Del
					delCnt++
				}
			}
		}

		// A mutation is flagged as a pessimistic lock only when the key was
		// locked and the transaction itself is pessimistic.
		var isPessimistic bool
		if flags.HasLocked() {
			isPessimistic = c.isPessimistic
		}
		c.mutations.Push(op, isPessimistic, it.Handle())
		size += len(key) + len(value)

		// The first key that is not a pure existence check becomes the primary.
		if len(c.primaryKey) == 0 && op != kvrpcpb.Op_CheckNotExists {
			c.primaryKey = key
		}
	}

	if c.mutations.Len() == 0 {
		return nil
	}
	c.txnSize = size

	// Log transactions that are large by entry count or by total size.
	const logEntryCount = 10000
	const logSize = 4 * 1024 * 1024 // 4MB
	if c.mutations.Len() > logEntryCount || size > logSize {
		logutil.BgLogger().Info("[BIG_TXN]",
			zap.Uint64("session", c.sessionID),
			zap.String("key sample", kv.StrKey(c.mutations.GetKey(0))),
			zap.Int("size", size),
			zap.Int("keys", c.mutations.Len()),
			zap.Int("puts", putCnt),
			zap.Int("dels", delCnt),
			zap.Int("locks", lockCnt),
			zap.Int("checks", checkCnt),
			zap.Uint64("txnStartTS", txn.startTS))
	}

	// Sanity check for startTS.
	if txn.StartTS() == math.MaxUint64 {
		err = errors.Errorf("try to commit with invalid txnStartTS: %d", txn.StartTS())
		logutil.BgLogger().Error("commit failed",
			zap.Uint64("session", c.sessionID),
			zap.Error(err))
		return errors.Trace(err)
	}

	commitDetail := &util.CommitDetails{WriteSize: size, WriteKeys: c.mutations.Len()}
	metrics.TiKVTxnWriteKVCountHistogram.Observe(float64(commitDetail.WriteKeys))
	metrics.TiKVTxnWriteSizeHistogram.Observe(float64(commitDetail.WriteSize))
	// Op_CheckNotExists keys are prewrite-only, so they need no commit.
	c.hasNoNeedCommitKeys = checkCnt > 0
	c.lockTTL = txnLockTTL(txn.startTime, size)
	c.priority = txn.priority.ToPB()
	c.syncLog = txn.syncLog
	c.resourceGroupTag = txn.resourceGroupTag
	c.setDetail(commitDetail)
	return nil
}
   502  
   503  func (c *twoPhaseCommitter) primary() []byte {
   504  	if len(c.primaryKey) == 0 {
   505  		return c.mutations.GetKey(0)
   506  	}
   507  	return c.primaryKey
   508  }
   509  
   510  // asyncSecondaries returns all keys that must be checked in the recovery phase of an async commit.
   511  func (c *twoPhaseCommitter) asyncSecondaries() [][]byte {
   512  	secondaries := make([][]byte, 0, c.mutations.Len())
   513  	for i := 0; i < c.mutations.Len(); i++ {
   514  		k := c.mutations.GetKey(i)
   515  		if bytes.Equal(k, c.primary()) || c.mutations.GetOp(i) == kvrpcpb.Op_CheckNotExists {
   516  			continue
   517  		}
   518  		secondaries = append(secondaries, k)
   519  	}
   520  	return secondaries
   521  }
   522  
// bytesPerMiB is the number of bytes in one mebibyte.
const bytesPerMiB = 1024 * 1024

// ttlFactor is the multiplier in the lock TTL formula:
// ttl = ttlFactor * sqrt(writeSizeInMiB)
var ttlFactor = 6000

// By default, a lock older than 3000ms is considered unusual (the client that
// created the lock might be dead). Other clients may clean up this kind of lock.
// For locks created recently, we will do backoff and retry.
var defaultLockTTL uint64 = 3000
   532  
   533  func txnLockTTL(startTime time.Time, txnSize int) uint64 {
   534  	// Increase lockTTL for large transactions.
   535  	// The formula is `ttl = ttlFactor * sqrt(sizeInMiB)`.
   536  	// When writeSize is less than 256KB, the base ttl is defaultTTL (3s);
   537  	// When writeSize is 1MiB, 4MiB, or 10MiB, ttl is 6s, 12s, 20s correspondingly;
   538  	lockTTL := defaultLockTTL
   539  	if txnSize >= txnCommitBatchSize {
   540  		sizeMiB := float64(txnSize) / bytesPerMiB
   541  		lockTTL = uint64(float64(ttlFactor) * math.Sqrt(sizeMiB))
   542  		if lockTTL < defaultLockTTL {
   543  			lockTTL = defaultLockTTL
   544  		}
   545  		if lockTTL > ManagedLockTTL {
   546  			lockTTL = ManagedLockTTL
   547  		}
   548  	}
   549  
   550  	// Increase lockTTL by the transaction's read time.
   551  	// When resolving a lock, we compare current ts and startTS+lockTTL to decide whether to clean up. If a txn
   552  	// takes a long time to read, increasing its TTL will help to prevent it from been aborted soon after prewrite.
   553  	elapsed := time.Since(startTime) / time.Millisecond
   554  	return lockTTL + uint64(elapsed)
   555  }
   556  
// preSplitDetectThreshold is the number of mutations in a single region at or
// above which groupMutations attempts to pre-split that region. It is read
// with atomic.LoadUint32.
var preSplitDetectThreshold uint32 = 100000

// preSplitSizeThreshold is the accumulated key+value size, in bytes, at which
// preSplitRegion emits a split key. It is read with atomic.LoadUint32.
var preSplitSizeThreshold uint32 = 32 << 20
   559  
   560  // doActionOnMutations groups keys into primary batch and secondary batches, if primary batch exists in the key,
   561  // it does action on primary batch first, then on secondary batches. If action is commit, secondary batches
   562  // is done in background goroutine.
   563  func (c *twoPhaseCommitter) doActionOnMutations(bo *retry.Backoffer, action twoPhaseCommitAction, mutations CommitterMutations) error {
   564  	if mutations.Len() == 0 {
   565  		return nil
   566  	}
   567  	groups, err := c.groupMutations(bo, mutations)
   568  	if err != nil {
   569  		return errors.Trace(err)
   570  	}
   571  
   572  	// This is redundant since `doActionOnGroupMutations` will still split groups into batches and
   573  	// check the number of batches. However we don't want the check fail after any code changes.
   574  	c.checkOnePCFallBack(action, len(groups))
   575  
   576  	return c.doActionOnGroupMutations(bo, action, groups)
   577  }
   578  
// groupedMutations is a run of mutations that all belong to one region.
type groupedMutations struct {
	// region identifies the region the mutations were located in.
	region locate.RegionVerID
	// mutations is the slice of the original mutation set that falls in region.
	mutations CommitterMutations
}
   583  
   584  // groupSortedMutationsByRegion separates keys into groups by their belonging Regions.
   585  func groupSortedMutationsByRegion(c *locate.RegionCache, bo *retry.Backoffer, m CommitterMutations) ([]groupedMutations, error) {
   586  	var (
   587  		groups  []groupedMutations
   588  		lastLoc *locate.KeyLocation
   589  	)
   590  	lastUpperBound := 0
   591  	for i := 0; i < m.Len(); i++ {
   592  		if lastLoc == nil || !lastLoc.Contains(m.GetKey(i)) {
   593  			if lastLoc != nil {
   594  				groups = append(groups, groupedMutations{
   595  					region:    lastLoc.Region,
   596  					mutations: m.Slice(lastUpperBound, i),
   597  				})
   598  				lastUpperBound = i
   599  			}
   600  			var err error
   601  			lastLoc, err = c.LocateKey(bo, m.GetKey(i))
   602  			if err != nil {
   603  				return nil, errors.Trace(err)
   604  			}
   605  		}
   606  	}
   607  	if lastLoc != nil {
   608  		groups = append(groups, groupedMutations{
   609  			region:    lastLoc.Region,
   610  			mutations: m.Slice(lastUpperBound, m.Len()),
   611  		})
   612  	}
   613  	return groups, nil
   614  }
   615  
   616  // groupMutations groups mutations by region, then checks for any large groups and in that case pre-splits the region.
   617  func (c *twoPhaseCommitter) groupMutations(bo *retry.Backoffer, mutations CommitterMutations) ([]groupedMutations, error) {
   618  	groups, err := groupSortedMutationsByRegion(c.store.GetRegionCache(), bo, mutations)
   619  	if err != nil {
   620  		return nil, errors.Trace(err)
   621  	}
   622  
   623  	// Pre-split regions to avoid too much write workload into a single region.
   624  	// In the large transaction case, this operation is important to avoid TiKV 'server is busy' error.
   625  	var didPreSplit bool
   626  	preSplitDetectThresholdVal := atomic.LoadUint32(&preSplitDetectThreshold)
   627  	for _, group := range groups {
   628  		if uint32(group.mutations.Len()) >= preSplitDetectThresholdVal {
   629  			logutil.BgLogger().Info("2PC detect large amount of mutations on a single region",
   630  				zap.Uint64("region", group.region.GetID()),
   631  				zap.Int("mutations count", group.mutations.Len()))
   632  			if c.preSplitRegion(bo.GetCtx(), group) {
   633  				didPreSplit = true
   634  			}
   635  		}
   636  	}
   637  	// Reload region cache again.
   638  	if didPreSplit {
   639  		groups, err = groupSortedMutationsByRegion(c.store.GetRegionCache(), bo, mutations)
   640  		if err != nil {
   641  			return nil, errors.Trace(err)
   642  		}
   643  	}
   644  
   645  	return groups, nil
   646  }
   647  
   648  func (c *twoPhaseCommitter) preSplitRegion(ctx context.Context, group groupedMutations) bool {
   649  	splitKeys := make([][]byte, 0, 4)
   650  
   651  	preSplitSizeThresholdVal := atomic.LoadUint32(&preSplitSizeThreshold)
   652  	regionSize := 0
   653  	keysLength := group.mutations.Len()
   654  	// The value length maybe zero for pessimistic lock keys
   655  	for i := 0; i < keysLength; i++ {
   656  		regionSize = regionSize + len(group.mutations.GetKey(i)) + len(group.mutations.GetValue(i))
   657  		// The second condition is used for testing.
   658  		if regionSize >= int(preSplitSizeThresholdVal) {
   659  			regionSize = 0
   660  			splitKeys = append(splitKeys, group.mutations.GetKey(i))
   661  		}
   662  	}
   663  	if len(splitKeys) == 0 {
   664  		return false
   665  	}
   666  
   667  	regionIDs, err := c.store.SplitRegions(ctx, splitKeys, true, nil)
   668  	if err != nil {
   669  		logutil.BgLogger().Warn("2PC split regions failed", zap.Uint64("regionID", group.region.GetID()),
   670  			zap.Int("keys count", keysLength), zap.Error(err))
   671  		return false
   672  	}
   673  
   674  	for _, regionID := range regionIDs {
   675  		err := c.store.WaitScatterRegionFinish(ctx, regionID, 0)
   676  		if err != nil {
   677  			logutil.BgLogger().Warn("2PC wait scatter region failed", zap.Uint64("regionID", regionID), zap.Error(err))
   678  		}
   679  	}
   680  	// Invalidate the old region cache information.
   681  	c.store.GetRegionCache().InvalidateCachedRegion(group.region)
   682  	return true
   683  }
   684  
// CommitSecondaryMaxBackoff is the max sleep time of the backoffer used when
// committing secondary keys in the background goroutine.
const CommitSecondaryMaxBackoff = 41000
   687  
// doActionOnGroupMutations splits groups into batches (there is one group per region, and potentially many batches per group, but all mutations
// in a batch will belong to the same region) and runs action on them.
//
// When the first batch holds the primary key and the action is a non-async
// commit, a cleanup or a pessimistic lock, the primary batch is handled first
// and on its own; the remaining batches follow. For a first-attempt, non-async
// commit the remaining (secondary) batches are committed in a background
// goroutine and this function returns without waiting for them.
func (c *twoPhaseCommitter) doActionOnGroupMutations(bo *retry.Backoffer, action twoPhaseCommitAction, groups []groupedMutations) error {
	action.tiKVTxnRegionsNumHistogram().Observe(float64(len(groups)))

	// Batches are sized by key length only, except for prewrite (below), which
	// also carries values.
	var sizeFunc = c.keySize

	switch act := action.(type) {
	case actionPrewrite:
		// Do not update regionTxnSize on retries. They are not used when building a PrewriteRequest.
		if !act.retry {
			for _, group := range groups {
				c.regionTxnSize[group.region.GetID()] = group.mutations.Len()
			}
		}
		sizeFunc = c.keyValueSize
		atomic.AddInt32(&c.getDetail().PrewriteRegionNum, int32(len(groups)))
	case actionPessimisticLock:
		if act.LockCtx.Stats != nil {
			act.LockCtx.Stats.RegionNum = int32(len(groups))
		}
	}

	batchBuilder := newBatched(c.primary())
	for _, group := range groups {
		batchBuilder.appendBatchMutationsBySize(group.region, group.mutations, sizeFunc, txnCommitBatchSize)
	}
	firstIsPrimary := batchBuilder.setPrimary()

	actionCommit, actionIsCommit := action.(actionCommit)
	_, actionIsCleanup := action.(actionCleanup)
	_, actionIsPessimisticLock := action.(actionPessimisticLock)

	c.checkOnePCFallBack(action, len(batchBuilder.allBatches()))

	var err error
	// Failpoint for tests. Note that `err` is shadowed inside this block; the
	// inner returns use the shadowed variable.
	if val, err := util.EvalFailpoint("skipKeyReturnOK"); err == nil {
		valStr, ok := val.(string)
		if ok && c.sessionID > 0 {
			if firstIsPrimary && actionIsPessimisticLock {
				logutil.Logger(bo.GetCtx()).Warn("pessimisticLock failpoint", zap.String("valStr", valStr))
				switch valStr {
				case "pessimisticLockSkipPrimary":
					err = c.doActionOnBatches(bo, action, batchBuilder.allBatches())
					return err
				case "pessimisticLockSkipSecondary":
					err = c.doActionOnBatches(bo, action, batchBuilder.primaryBatch())
					return err
				}
			}
		}
	}
	// Failpoint for tests: skip pessimistic rollback entirely.
	if _, err := util.EvalFailpoint("pessimisticRollbackDoNth"); err == nil {
		_, actionIsPessimisticRollback := action.(actionPessimisticRollback)
		if actionIsPessimisticRollback && c.sessionID > 0 {
			logutil.Logger(bo.GetCtx()).Warn("pessimisticRollbackDoNth failpoint")
			return nil
		}
	}

	if firstIsPrimary &&
		((actionIsCommit && !c.isAsyncCommit()) || actionIsCleanup || actionIsPessimisticLock) {
		// primary should be committed(not async commit)/cleanup/pessimistically locked first
		err = c.doActionOnBatches(bo, action, batchBuilder.primaryBatch())
		if err != nil {
			return errors.Trace(err)
		}
		// Test hook: signal that the primary has been committed and wait for
		// the test to allow the committer to continue.
		if actionIsCommit && c.testingKnobs.bkAfterCommitPrimary != nil && c.testingKnobs.acAfterCommitPrimary != nil {
			c.testingKnobs.acAfterCommitPrimary <- struct{}{}
			<-c.testingKnobs.bkAfterCommitPrimary
		}
		// The primary batch is done; drop it so it is not processed again below.
		batchBuilder.forgetPrimary()
	}
	util.EvalFailpoint("afterPrimaryBatch")

	// Already spawned a goroutine for async commit transaction.
	if actionIsCommit && !actionCommit.retry && !c.isAsyncCommit() {
		// The secondaries get their own backoffer bound to the store's context
		// rather than to the caller's.
		secondaryBo := retry.NewBackofferWithVars(c.store.Ctx(), CommitSecondaryMaxBackoff, c.txn.vars)
		c.store.WaitGroup().Add(1)
		go func() {
			defer c.store.WaitGroup().Done()
			if c.sessionID > 0 {
				if v, err := util.EvalFailpoint("beforeCommitSecondaries"); err == nil {
					if s, ok := v.(string); !ok {
						logutil.Logger(bo.GetCtx()).Info("[failpoint] sleep 2s before commit secondary keys",
							zap.Uint64("sessionID", c.sessionID), zap.Uint64("txnStartTS", c.startTS), zap.Uint64("txnCommitTS", c.commitTS))
						time.Sleep(2 * time.Second)
					} else if s == "skip" {
						logutil.Logger(bo.GetCtx()).Info("[failpoint] injected skip committing secondaries",
							zap.Uint64("sessionID", c.sessionID), zap.Uint64("txnStartTS", c.startTS), zap.Uint64("txnCommitTS", c.commitTS))
						return
					}
				}
			}

			// Failures here are only logged and counted; they are not reported
			// to the caller, which has already returned.
			e := c.doActionOnBatches(secondaryBo, action, batchBuilder.allBatches())
			if e != nil {
				logutil.BgLogger().Debug("2PC async doActionOnBatches",
					zap.Uint64("session", c.sessionID),
					zap.Stringer("action type", action),
					zap.Error(e))
				metrics.SecondaryLockCleanupFailureCounterCommit.Inc()
			}
		}()
	} else {
		err = c.doActionOnBatches(bo, action, batchBuilder.allBatches())
	}
	return errors.Trace(err)
}
   797  
   798  // doActionOnBatches does action to batches in parallel.
   799  func (c *twoPhaseCommitter) doActionOnBatches(bo *retry.Backoffer, action twoPhaseCommitAction, batches []batchMutations) error {
   800  	if len(batches) == 0 {
   801  		return nil
   802  	}
   803  
   804  	noNeedFork := len(batches) == 1
   805  	if !noNeedFork {
   806  		if ac, ok := action.(actionCommit); ok && ac.retry {
   807  			noNeedFork = true
   808  		}
   809  	}
   810  	if noNeedFork {
   811  		for _, b := range batches {
   812  			e := action.handleSingleBatch(c, bo, b)
   813  			if e != nil {
   814  				logutil.BgLogger().Debug("2PC doActionOnBatches failed",
   815  					zap.Uint64("session", c.sessionID),
   816  					zap.Stringer("action type", action),
   817  					zap.Error(e),
   818  					zap.Uint64("txnStartTS", c.startTS))
   819  				return errors.Trace(e)
   820  			}
   821  		}
   822  		return nil
   823  	}
   824  	rateLim := len(batches)
   825  	// Set rateLim here for the large transaction.
   826  	// If the rate limit is too high, tikv will report service is busy.
   827  	// If the rate limit is too low, we can't full utilize the tikv's throughput.
   828  	// TODO: Find a self-adaptive way to control the rate limit here.
   829  	if rateLim > config.GetGlobalConfig().CommitterConcurrency {
   830  		rateLim = config.GetGlobalConfig().CommitterConcurrency
   831  	}
   832  	batchExecutor := newBatchExecutor(rateLim, c, action, bo)
   833  	err := batchExecutor.process(batches)
   834  	return errors.Trace(err)
   835  }
   836  
   837  func (c *twoPhaseCommitter) keyValueSize(key, value []byte) int {
   838  	return len(key) + len(value)
   839  }
   840  
   841  func (c *twoPhaseCommitter) keySize(key, value []byte) int {
   842  	return len(key)
   843  }
   844  
// SetDiskFullOpt stores level as the committer's disk-full option.
// NOTE(review): presumably this value is attached to the write requests the
// committer sends; that usage is not visible in this part of the file.
func (c *twoPhaseCommitter) SetDiskFullOpt(level kvrpcpb.DiskFullOpt) {
	c.diskFullOpt = level
}
   848  
// ttlManagerState is the lifecycle state of a ttlManager. Its underlying type
// is uint32 so that the state can be changed with atomic compare-and-swap.
type ttlManagerState uint32

const (
	// stateUninitialized is the zero value. run only starts the keepAlive
	// goroutine from this state, and reset returns a running manager to it.
	stateUninitialized ttlManagerState = iota
	// stateRunning is set by run once it has won the right to start keepAlive.
	stateRunning
	// stateClosed is set by close; no method in this file moves a manager out
	// of this state.
	stateClosed
)
   856  
// ttlManager controls the background keepAlive goroutine that periodically
// sends transaction heartbeats for the primary lock.
type ttlManager struct {
	// state is read and written only through atomic compare-and-swap in
	// run, close and reset.
	state ttlManagerState
	// ch is created by run and handed to keepAlive; close and reset close it
	// to tell keepAlive to exit.
	ch chan struct{}
	// lockCtx is the lock context that was passed to run.
	lockCtx *kv.LockCtx
}
   862  
   863  func (tm *ttlManager) run(c *twoPhaseCommitter, lockCtx *kv.LockCtx) {
   864  	if _, err := util.EvalFailpoint("doNotKeepAlive"); err == nil {
   865  		return
   866  	}
   867  
   868  	// Run only once.
   869  	if !atomic.CompareAndSwapUint32((*uint32)(&tm.state), uint32(stateUninitialized), uint32(stateRunning)) {
   870  		return
   871  	}
   872  	tm.ch = make(chan struct{})
   873  	tm.lockCtx = lockCtx
   874  
   875  	go keepAlive(c, tm.ch, c.primary(), lockCtx)
   876  }
   877  
   878  func (tm *ttlManager) close() {
   879  	if !atomic.CompareAndSwapUint32((*uint32)(&tm.state), uint32(stateRunning), uint32(stateClosed)) {
   880  		return
   881  	}
   882  	close(tm.ch)
   883  }
   884  
   885  func (tm *ttlManager) reset() {
   886  	if !atomic.CompareAndSwapUint32((*uint32)(&tm.state), uint32(stateRunning), uint32(stateUninitialized)) {
   887  		return
   888  	}
   889  	close(tm.ch)
   890  }
   891  
   892  const keepAliveMaxBackoff = 20000        // 20 seconds
   893  const pessimisticLockMaxBackoff = 600000 // 10 minutes
   894  const maxConsecutiveFailure = 10
   895  
   896  func keepAlive(c *twoPhaseCommitter, closeCh chan struct{}, primaryKey []byte, lockCtx *kv.LockCtx) {
   897  	// Ticker is set to 1/2 of the ManagedLockTTL.
   898  	ticker := time.NewTicker(time.Duration(atomic.LoadUint64(&ManagedLockTTL)) * time.Millisecond / 2)
   899  	defer ticker.Stop()
   900  	keepFail := 0
   901  	for {
   902  		select {
   903  		case <-closeCh:
   904  			return
   905  		case <-ticker.C:
   906  			// If kill signal is received, the ttlManager should exit.
   907  			if lockCtx != nil && lockCtx.Killed != nil && atomic.LoadUint32(lockCtx.Killed) != 0 {
   908  				return
   909  			}
   910  			bo := retry.NewBackofferWithVars(context.Background(), keepAliveMaxBackoff, c.txn.vars)
   911  			now, err := c.store.GetTimestampWithRetry(bo, c.txn.GetScope())
   912  			if err != nil {
   913  				logutil.Logger(bo.GetCtx()).Warn("keepAlive get tso fail",
   914  					zap.Error(err))
   915  				return
   916  			}
   917  
   918  			uptime := uint64(oracle.ExtractPhysical(now) - oracle.ExtractPhysical(c.startTS))
   919  			if uptime > config.GetGlobalConfig().MaxTxnTTL {
   920  				// Checks maximum lifetime for the ttlManager, so when something goes wrong
   921  				// the key will not be locked forever.
   922  				logutil.Logger(bo.GetCtx()).Info("ttlManager live up to its lifetime",
   923  					zap.Uint64("txnStartTS", c.startTS),
   924  					zap.Uint64("uptime", uptime),
   925  					zap.Uint64("maxTxnTTL", config.GetGlobalConfig().MaxTxnTTL))
   926  				metrics.TiKVTTLLifeTimeReachCounter.Inc()
   927  				// the pessimistic locks may expire if the ttl manager has timed out, set `LockExpired` flag
   928  				// so that this transaction could only commit or rollback with no more statement executions
   929  				if c.isPessimistic && lockCtx != nil && lockCtx.LockExpired != nil {
   930  					atomic.StoreUint32(lockCtx.LockExpired, 1)
   931  				}
   932  				return
   933  			}
   934  
   935  			newTTL := uptime + atomic.LoadUint64(&ManagedLockTTL)
   936  			logutil.Logger(bo.GetCtx()).Info("send TxnHeartBeat",
   937  				zap.Uint64("startTS", c.startTS), zap.Uint64("newTTL", newTTL))
   938  			startTime := time.Now()
   939  			_, stopHeartBeat, err := sendTxnHeartBeat(bo, c.store, primaryKey, c.startTS, newTTL)
   940  			if err != nil {
   941  				keepFail++
   942  				metrics.TxnHeartBeatHistogramError.Observe(time.Since(startTime).Seconds())
   943  				logutil.Logger(bo.GetCtx()).Debug("send TxnHeartBeat failed",
   944  					zap.Error(err),
   945  					zap.Uint64("txnStartTS", c.startTS))
   946  				if stopHeartBeat || keepFail > maxConsecutiveFailure {
   947  					logutil.Logger(bo.GetCtx()).Warn("stop TxnHeartBeat",
   948  						zap.Error(err),
   949  						zap.Int("consecutiveFailure", keepFail),
   950  						zap.Uint64("txnStartTS", c.startTS))
   951  					return
   952  				}
   953  				continue
   954  			}
   955  			keepFail = 0
   956  			metrics.TxnHeartBeatHistogramOK.Observe(time.Since(startTime).Seconds())
   957  		}
   958  	}
   959  }
   960  
   961  func sendTxnHeartBeat(bo *retry.Backoffer, store kvstore, primary []byte, startTS, ttl uint64) (newTTL uint64, stopHeartBeat bool, err error) {
   962  	req := tikvrpc.NewRequest(tikvrpc.CmdTxnHeartBeat, &kvrpcpb.TxnHeartBeatRequest{
   963  		PrimaryLock:   primary,
   964  		StartVersion:  startTS,
   965  		AdviseLockTtl: ttl,
   966  	})
   967  	for {
   968  		loc, err := store.GetRegionCache().LocateKey(bo, primary)
   969  		if err != nil {
   970  			return 0, false, errors.Trace(err)
   971  		}
   972  		req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
   973  		resp, err := store.SendReq(bo, req, loc.Region, client.ReadTimeoutShort)
   974  		if err != nil {
   975  			return 0, false, errors.Trace(err)
   976  		}
   977  		regionErr, err := resp.GetRegionError()
   978  		if err != nil {
   979  			return 0, false, errors.Trace(err)
   980  		}
   981  		if regionErr != nil {
   982  			// For other region error and the fake region error, backoff because
   983  			// there's something wrong.
   984  			// For the real EpochNotMatch error, don't backoff.
   985  			if regionErr.GetEpochNotMatch() == nil || locate.IsFakeRegionError(regionErr) {
   986  				err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
   987  				if err != nil {
   988  					return 0, false, errors.Trace(err)
   989  				}
   990  			}
   991  			continue
   992  		}
   993  		if resp.Resp == nil {
   994  			return 0, false, errors.Trace(tikverr.ErrBodyMissing)
   995  		}
   996  		cmdResp := resp.Resp.(*kvrpcpb.TxnHeartBeatResponse)
   997  		if keyErr := cmdResp.GetError(); keyErr != nil {
   998  			return 0, true, errors.Errorf("txn %d heartbeat fail, primary key = %v, err = %s", startTS, hex.EncodeToString(primary), tikverr.ExtractKeyErr(keyErr))
   999  		}
  1000  		return cmdResp.GetLockTtl(), false, nil
  1001  	}
  1002  }
  1003  
  1004  // checkAsyncCommit checks if async commit protocol is available for current transaction commit, true is returned if possible.
  1005  func (c *twoPhaseCommitter) checkAsyncCommit() bool {
  1006  	// Disable async commit in local transactions
  1007  	if c.txn.GetScope() != oracle.GlobalTxnScope {
  1008  		return false
  1009  	}
  1010  
  1011  	asyncCommitCfg := config.GetGlobalConfig().TiKVClient.AsyncCommit
  1012  	// TODO the keys limit need more tests, this value makes the unit test pass by now.
  1013  	// Async commit is not compatible with Binlog because of the non unique timestamp issue.
  1014  	if c.txn.enableAsyncCommit &&
  1015  		uint(c.mutations.Len()) <= asyncCommitCfg.KeysLimit &&
  1016  		!c.shouldWriteBinlog() {
  1017  		totalKeySize := uint64(0)
  1018  		for i := 0; i < c.mutations.Len(); i++ {
  1019  			totalKeySize += uint64(len(c.mutations.GetKey(i)))
  1020  			if totalKeySize > asyncCommitCfg.TotalKeySizeLimit {
  1021  				return false
  1022  			}
  1023  		}
  1024  		return true
  1025  	}
  1026  	return false
  1027  }
  1028  
  1029  // checkOnePC checks if 1PC protocol is available for current transaction.
  1030  func (c *twoPhaseCommitter) checkOnePC() bool {
  1031  	// Disable 1PC in local transactions
  1032  	if c.txn.GetScope() != oracle.GlobalTxnScope {
  1033  		return false
  1034  	}
  1035  
  1036  	return !c.shouldWriteBinlog() && c.txn.enable1PC
  1037  }
  1038  
  1039  func (c *twoPhaseCommitter) needLinearizability() bool {
  1040  	return !c.txn.causalConsistency
  1041  }
  1042  
  1043  func (c *twoPhaseCommitter) isAsyncCommit() bool {
  1044  	return atomic.LoadUint32(&c.useAsyncCommit) > 0
  1045  }
  1046  
  1047  func (c *twoPhaseCommitter) setAsyncCommit(val bool) {
  1048  	if val {
  1049  		atomic.StoreUint32(&c.useAsyncCommit, 1)
  1050  	} else {
  1051  		atomic.StoreUint32(&c.useAsyncCommit, 0)
  1052  	}
  1053  }
  1054  
  1055  func (c *twoPhaseCommitter) isOnePC() bool {
  1056  	return atomic.LoadUint32(&c.useOnePC) > 0
  1057  }
  1058  
  1059  func (c *twoPhaseCommitter) setOnePC(val bool) {
  1060  	if val {
  1061  		atomic.StoreUint32(&c.useOnePC, 1)
  1062  	} else {
  1063  		atomic.StoreUint32(&c.useOnePC, 0)
  1064  	}
  1065  }
  1066  
  1067  func (c *twoPhaseCommitter) checkOnePCFallBack(action twoPhaseCommitAction, batchCount int) {
  1068  	if _, ok := action.(actionPrewrite); ok {
  1069  		if batchCount > 1 {
  1070  			c.setOnePC(false)
  1071  		}
  1072  	}
  1073  }
  1074  
const (
	// cleanupMaxBackoff is the backoff budget given to the backoffers that
	// cleanup creates for rolling back a failed transaction's keys.
	// NOTE(review): presumably milliseconds, like the other *MaxBackoff values.
	cleanupMaxBackoff = 20000
	// TsoMaxBackoff is the max sleep time to get tso.
	TsoMaxBackoff = 15000
)
  1080  
  1081  func (c *twoPhaseCommitter) cleanup(ctx context.Context) {
  1082  	c.cleanWg.Add(1)
  1083  	c.store.WaitGroup().Add(1)
  1084  	go func() {
  1085  		defer c.store.WaitGroup().Done()
  1086  		if _, err := util.EvalFailpoint("commitFailedSkipCleanup"); err == nil {
  1087  			logutil.Logger(ctx).Info("[failpoint] injected skip cleanup secondaries on failure",
  1088  				zap.Uint64("txnStartTS", c.startTS))
  1089  			c.cleanWg.Done()
  1090  			return
  1091  		}
  1092  
  1093  		cleanupKeysCtx := context.WithValue(c.store.Ctx(), retry.TxnStartKey, ctx.Value(retry.TxnStartKey))
  1094  		var err error
  1095  		if !c.isOnePC() {
  1096  			err = c.cleanupMutations(retry.NewBackofferWithVars(cleanupKeysCtx, cleanupMaxBackoff, c.txn.vars), c.mutations)
  1097  		} else if c.isPessimistic {
  1098  			err = c.pessimisticRollbackMutations(retry.NewBackofferWithVars(cleanupKeysCtx, cleanupMaxBackoff, c.txn.vars), c.mutations)
  1099  		}
  1100  
  1101  		if err != nil {
  1102  			metrics.SecondaryLockCleanupFailureCounterRollback.Inc()
  1103  			logutil.Logger(ctx).Info("2PC cleanup failed", zap.Error(err), zap.Uint64("txnStartTS", c.startTS),
  1104  				zap.Bool("isPessimistic", c.isPessimistic), zap.Bool("isOnePC", c.isOnePC()))
  1105  		} else {
  1106  			logutil.Logger(ctx).Debug("2PC clean up done",
  1107  				zap.Uint64("txnStartTS", c.startTS), zap.Bool("isPessimistic", c.isPessimistic),
  1108  				zap.Bool("isOnePC", c.isOnePC()))
  1109  		}
  1110  		c.cleanWg.Done()
  1111  	}()
  1112  }
  1113  
// execute executes the two-phase commit protocol.
//
// The steps, in order, are:
//  1. Decide whether async commit and/or 1PC can be tried and set the
//     corresponding flags.
//  2. When the commit TS may be calculated (async commit or 1PC), fetch a
//     timestamp for minCommitTS if linearizability is needed, and calculate
//     maxCommitTS.
//  3. Prewrite all mutations, then wait for the prewrite binlog if one is
//     being written.
//  4. For 1PC, adopt onePCCommitTS and return.
//  5. Otherwise pick a commit TS (minCommitTS for async commit, a fresh
//     timestamp otherwise), validate the schema (optionally amending the
//     transaction) and check that the transaction has not expired.
//  6. For async commit, commit in a background goroutine and return nil;
//     otherwise commit synchronously through commitTxn.
//
// The deferred function runs on every return path: it updates the per-protocol
// metrics, triggers cleanup for transactions that did not commit and whose
// result is not undetermined, and, for plain 2PC, records the commit TS on the
// transaction and finishes the binlog.
func (c *twoPhaseCommitter) execute(ctx context.Context) (err error) {
	// binlogSkipped is set from the prewrite binlog result and read by the
	// deferred function below.
	var binlogSkipped bool
	defer func() {
		if c.isOnePC() {
			// The error means the 1PC transaction failed.
			if err != nil {
				if c.getUndeterminedErr() == nil {
					c.cleanup(ctx)
				}
				metrics.OnePCTxnCounterError.Inc()
			} else {
				metrics.OnePCTxnCounterOk.Inc()
			}
		} else if c.isAsyncCommit() {
			// The error means the async commit should not succeed.
			if err != nil {
				if c.getUndeterminedErr() == nil {
					c.cleanup(ctx)
				}
				metrics.AsyncCommitTxnCounterError.Inc()
			} else {
				metrics.AsyncCommitTxnCounterOk.Inc()
			}
		} else {
			// Always clean up all written keys if the txn does not commit.
			c.mu.RLock()
			committed := c.mu.committed
			undetermined := c.mu.undeterminedErr != nil
			c.mu.RUnlock()
			if !committed && !undetermined {
				c.cleanup(ctx)
				metrics.TwoPCTxnCounterError.Inc()
			} else {
				metrics.TwoPCTxnCounterOk.Inc()
			}
			c.txn.commitTS = c.commitTS
			if binlogSkipped {
				c.binlog.Skip()
				return
			}
			if !c.shouldWriteBinlog() {
				return
			}
			// A commit TS of 0 is passed to the binlog when the transaction failed.
			if err != nil {
				c.binlog.Commit(ctx, 0)
			} else {
				c.binlog.Commit(ctx, int64(c.commitTS))
			}
		}
	}()

	// commitTSMayBeCalculated is true when async commit or 1PC is going to be
	// tried; both need minCommitTS/maxCommitTS to be prepared before prewrite.
	commitTSMayBeCalculated := false
	// Check async commit is available or not.
	if c.checkAsyncCommit() {
		commitTSMayBeCalculated = true
		c.setAsyncCommit(true)
		c.hasTriedAsyncCommit = true
	}
	// Check if 1PC is enabled.
	if c.checkOnePC() {
		commitTSMayBeCalculated = true
		c.setOnePC(true)
		c.hasTriedOnePC = true
	}

	// TODO(youjiali1995): It's better to use different maxSleep for different operations
	// and distinguish permanent errors from temporary errors, for example:
	//   - If all PDs are down, all requests to PD will fail due to network error.
	//     The maxSleep shouldn't be very long in this case.
	//   - If the region isn't found in PD, it's possible the reason is write-stall.
	//     The maxSleep can be long in this case.
	bo := retry.NewBackofferWithVars(ctx, PrewriteMaxBackoff, c.txn.vars)

	// If we want to use async commit or 1PC and also want linearizability across
	// all nodes, we have to make sure the commit TS of this transaction is greater
	// than the snapshot TS of all existent readers. So we get a new timestamp
	// from PD and plus one as our MinCommitTS.
	if commitTSMayBeCalculated && c.needLinearizability() {
		util.EvalFailpoint("getMinCommitTSFromTSO")
		latestTS, err := c.store.GetTimestampWithRetry(bo, c.txn.GetScope())
		// If we fail to get a timestamp from PD, we just propagate the failure
		// instead of falling back to the normal 2PC because a normal 2PC will
		// also be likely to fail due to the same timestamp issue.
		if err != nil {
			return errors.Trace(err)
		}
		// Plus 1 to avoid producing the same commit TS with previously committed transactions
		c.minCommitTS = latestTS + 1
	}
	// Calculate maxCommitTS if necessary
	if commitTSMayBeCalculated {
		if err = c.calculateMaxCommitTS(ctx); err != nil {
			return errors.Trace(err)
		}
	}

	if c.sessionID > 0 {
		util.EvalFailpoint("beforePrewrite")
	}

	c.prewriteStarted = true
	// The binlog prewrite is started before the mutations are prewritten; its
	// result is collected from binlogChan afterwards.
	var binlogChan <-chan BinlogWriteResult
	if c.shouldWriteBinlog() {
		binlogChan = c.binlog.Prewrite(ctx, c.primary())
	}

	start := time.Now()
	err = c.prewriteMutations(bo, c.mutations)

	if err != nil {
		// TODO: Now we return an undetermined error as long as one of the prewrite
		// RPCs fails. However, if there are multiple errors and some of the errors
		// are not RPC failures, we can return the actual error instead of undetermined.
		if undeterminedErr := c.getUndeterminedErr(); undeterminedErr != nil {
			logutil.Logger(ctx).Error("2PC commit result undetermined",
				zap.Error(err),
				zap.NamedError("rpcErr", undeterminedErr),
				zap.Uint64("txnStartTS", c.startTS))
			return errors.Trace(terror.ErrResultUndetermined)
		}
	}

	// Record prewrite statistics even when the prewrite failed; the prewrite
	// error itself is returned further down, after the binlog result is read.
	commitDetail := c.getDetail()
	commitDetail.PrewriteTime = time.Since(start)
	if bo.GetTotalSleep() > 0 {
		boSleep := int64(bo.GetTotalSleep()) * int64(time.Millisecond)
		commitDetail.Mu.Lock()
		if boSleep > commitDetail.Mu.CommitBackoffTime {
			commitDetail.Mu.CommitBackoffTime = boSleep
			commitDetail.Mu.BackoffTypes = bo.GetTypes()
		}
		commitDetail.Mu.Unlock()
	}

	if binlogChan != nil {
		startWaitBinlog := time.Now()
		binlogWriteResult := <-binlogChan
		commitDetail.WaitPrewriteBinlogTime = time.Since(startWaitBinlog)
		if binlogWriteResult != nil {
			binlogSkipped = binlogWriteResult.Skipped()
			binlogErr := binlogWriteResult.GetError()
			if binlogErr != nil {
				return binlogErr
			}
		}
	}
	if err != nil {
		logutil.Logger(ctx).Debug("2PC failed on prewrite",
			zap.Error(err),
			zap.Uint64("txnStartTS", c.startTS))
		return errors.Trace(err)
	}

	// strip check_not_exists keys that no need to commit.
	c.stripNoNeedCommitKeys()

	var commitTS uint64

	// A transaction still flagged as 1PC after prewrite is finished here: it
	// only needs the commit TS recorded in onePCCommitTS.
	if c.isOnePC() {
		if c.onePCCommitTS == 0 {
			err = errors.Errorf("session %d invalid onePCCommitTS for 1PC protocol after prewrite, startTS=%v", c.sessionID, c.startTS)
			return errors.Trace(err)
		}
		c.commitTS = c.onePCCommitTS
		c.txn.commitTS = c.commitTS
		logutil.Logger(ctx).Debug("1PC protocol is used to commit this txn",
			zap.Uint64("startTS", c.startTS), zap.Uint64("commitTS", c.commitTS),
			zap.Uint64("session", c.sessionID))
		return nil
	}

	// A non-zero onePCCommitTS on a transaction that is not 1PC is treated as
	// a fatal inconsistency.
	if c.onePCCommitTS != 0 {
		logutil.Logger(ctx).Fatal("non 1PC transaction committed in 1PC",
			zap.Uint64("session", c.sessionID), zap.Uint64("startTS", c.startTS))
	}

	if c.isAsyncCommit() {
		if c.minCommitTS == 0 {
			err = errors.Errorf("session %d invalid minCommitTS for async commit protocol after prewrite, startTS=%v", c.sessionID, c.startTS)
			return errors.Trace(err)
		}
		commitTS = c.minCommitTS
	} else {
		start = time.Now()
		logutil.Event(ctx, "start get commit ts")
		commitTS, err = c.store.GetTimestampWithRetry(retry.NewBackofferWithVars(ctx, TsoMaxBackoff, c.txn.vars), c.txn.GetScope())
		if err != nil {
			logutil.Logger(ctx).Warn("2PC get commitTS failed",
				zap.Error(err),
				zap.Uint64("txnStartTS", c.startTS))
			return errors.Trace(err)
		}
		commitDetail.GetCommitTsTime = time.Since(start)
		logutil.Event(ctx, "finish get commit ts")
		logutil.SetTag(ctx, "commitTs", commitTS)
	}

	// The schema check is skipped for async commit transactions.
	if !c.isAsyncCommit() {
		// Amending is only attempted for pessimistic transactions that belong
		// to a session and have a schema amender.
		tryAmend := c.isPessimistic && c.sessionID > 0 && c.txn.schemaAmender != nil
		if !tryAmend {
			_, _, err = c.checkSchemaValid(ctx, commitTS, c.txn.schemaVer, false)
			if err != nil {
				return errors.Trace(err)
			}
		} else {
			relatedSchemaChange, memAmended, err := c.checkSchemaValid(ctx, commitTS, c.txn.schemaVer, true)
			if err != nil {
				return errors.Trace(err)
			}
			if memAmended {
				// Get new commitTS and check schema valid again.
				newCommitTS, err := c.getCommitTS(ctx, commitDetail)
				if err != nil {
					return errors.Trace(err)
				}
				// If schema check failed between commitTS and newCommitTs, report schema change error.
				_, _, err = c.checkSchemaValid(ctx, newCommitTS, relatedSchemaChange.LatestInfoSchema, false)
				if err != nil {
					logutil.Logger(ctx).Info("schema check after amend failed, it means the schema version changed again",
						zap.Uint64("startTS", c.startTS),
						zap.Uint64("amendTS", commitTS),
						zap.Int64("amendedSchemaVersion", relatedSchemaChange.LatestInfoSchema.SchemaMetaVersion()),
						zap.Uint64("newCommitTS", newCommitTS))
					return errors.Trace(err)
				}
				commitTS = newCommitTS
			}
		}
	}
	atomic.StoreUint64(&c.commitTS, commitTS)

	// Refuse to commit a transaction that has been running for longer than
	// MaxTxnTimeUse.
	if c.store.GetOracle().IsExpired(c.startTS, MaxTxnTimeUse, &oracle.Option{TxnScope: oracle.GlobalTxnScope}) {
		err = errors.Errorf("session %d txn takes too much time, txnStartTS: %d, comm: %d",
			c.sessionID, c.startTS, c.commitTS)
		return err
	}

	if c.sessionID > 0 {
		if val, err := util.EvalFailpoint("beforeCommit"); err == nil {
			// Pass multiple instructions in one string, delimited by commas, to trigger multiple behaviors, like
			// `return("delay,fail")`. Then they will be executed sequentially at once.
			if v, ok := val.(string); ok {
				for _, action := range strings.Split(v, ",") {
					// Async commit transactions cannot return error here, since it's already successful.
					if action == "fail" && !c.isAsyncCommit() {
						logutil.Logger(ctx).Info("[failpoint] injected failure before commit", zap.Uint64("txnStartTS", c.startTS))
						return errors.New("injected failure before commit")
					} else if action == "delay" {
						duration := time.Duration(rand.Int63n(int64(time.Second) * 5))
						logutil.Logger(ctx).Info("[failpoint] injected delay before commit",
							zap.Uint64("txnStartTS", c.startTS), zap.Duration("duration", duration))
						time.Sleep(duration)
					}
				}
			}
		}
	}

	if c.isAsyncCommit() {
		// For async commit protocol, the commit is considered success here.
		c.txn.commitTS = c.commitTS
		logutil.Logger(ctx).Debug("2PC will use async commit protocol to commit this txn",
			zap.Uint64("startTS", c.startTS), zap.Uint64("commitTS", c.commitTS),
			zap.Uint64("sessionID", c.sessionID))
		// The actual commit runs in the background, tracked by the store's wait
		// group and using the store's context instead of the caller's.
		c.store.WaitGroup().Add(1)
		go func() {
			defer c.store.WaitGroup().Done()
			if _, err := util.EvalFailpoint("asyncCommitDoNothing"); err == nil {
				return
			}
			commitBo := retry.NewBackofferWithVars(c.store.Ctx(), CommitSecondaryMaxBackoff, c.txn.vars)
			err := c.commitMutations(commitBo, c.mutations)
			if err != nil {
				logutil.Logger(ctx).Warn("2PC async commit failed", zap.Uint64("sessionID", c.sessionID),
					zap.Uint64("startTS", c.startTS), zap.Uint64("commitTS", c.commitTS), zap.Error(err))
			}
		}()
		return nil
	}
	return c.commitTxn(ctx, commitDetail)
}
  1396  
  1397  func (c *twoPhaseCommitter) commitTxn(ctx context.Context, commitDetail *util.CommitDetails) error {
  1398  	c.txn.GetMemBuffer().DiscardValues()
  1399  	start := time.Now()
  1400  
  1401  	// Use the VeryLongMaxBackoff to commit the primary key.
  1402  	commitBo := retry.NewBackofferWithVars(ctx, int(CommitMaxBackoff), c.txn.vars)
  1403  	err := c.commitMutations(commitBo, c.mutations)
  1404  	commitDetail.CommitTime = time.Since(start)
  1405  	if commitBo.GetTotalSleep() > 0 {
  1406  		commitDetail.Mu.Lock()
  1407  		commitDetail.Mu.CommitBackoffTime += int64(commitBo.GetTotalSleep()) * int64(time.Millisecond)
  1408  		commitDetail.Mu.BackoffTypes = append(commitDetail.Mu.BackoffTypes, commitBo.GetTypes()...)
  1409  		commitDetail.Mu.Unlock()
  1410  	}
  1411  	if err != nil {
  1412  		if undeterminedErr := c.getUndeterminedErr(); undeterminedErr != nil {
  1413  			logutil.Logger(ctx).Error("2PC commit result undetermined",
  1414  				zap.Error(err),
  1415  				zap.NamedError("rpcErr", undeterminedErr),
  1416  				zap.Uint64("txnStartTS", c.startTS))
  1417  			err = errors.Trace(terror.ErrResultUndetermined)
  1418  		}
  1419  		if !c.mu.committed {
  1420  			logutil.Logger(ctx).Debug("2PC failed on commit",
  1421  				zap.Error(err),
  1422  				zap.Uint64("txnStartTS", c.startTS))
  1423  			return errors.Trace(err)
  1424  		}
  1425  		logutil.Logger(ctx).Debug("got some exceptions, but 2PC was still successful",
  1426  			zap.Error(err),
  1427  			zap.Uint64("txnStartTS", c.startTS))
  1428  	}
  1429  	return nil
  1430  }
  1431  
  1432  func (c *twoPhaseCommitter) stripNoNeedCommitKeys() {
  1433  	if !c.hasNoNeedCommitKeys {
  1434  		return
  1435  	}
  1436  	m := c.mutations
  1437  	var newIdx int
  1438  	for oldIdx := range m.handles {
  1439  		key := m.GetKey(oldIdx)
  1440  		flags, err := c.txn.GetMemBuffer().GetFlags(key)
  1441  		if err == nil && flags.HasPrewriteOnly() {
  1442  			continue
  1443  		}
  1444  		m.handles[newIdx] = m.handles[oldIdx]
  1445  		newIdx++
  1446  	}
  1447  	c.mutations.handles = c.mutations.handles[:newIdx]
  1448  }
  1449  
// SchemaVer is the view of an info schema that the committer needs: it only
// has to report its schema version.
type SchemaVer interface {
	// SchemaMetaVersion returns the meta schema version.
	SchemaMetaVersion() int64
}
  1455  
// SchemaLeaseChecker is used to validate that the schema version has not
// changed during transaction execution.
type SchemaLeaseChecker interface {
	// CheckBySchemaVer checks whether the schema of the tables related to the
	// transaction has changed between startSchemaVer and the schema version at
	// txnTS. All the related schema changes are returned.
	CheckBySchemaVer(txnTS uint64, startSchemaVer SchemaVer) (*RelatedSchemaChange, error)
}
  1462  
// RelatedSchemaChange contains information about schema diff between two schema versions.
type RelatedSchemaChange struct {
	// PhyTblIDS presumably lists the physical table IDs affected by the
	// change — TODO confirm against the code that fills it in.
	PhyTblIDS []int64
	// ActionTypes presumably holds the schema change action of each affected
	// table — TODO confirm against the code that fills it in.
	ActionTypes []uint64
	// LatestInfoSchema is the newer schema of the diff; after a successful
	// amend, execute re-checks the schema starting from this version.
	LatestInfoSchema SchemaVer
	// Amendable presumably tells whether the transaction can be amended to
	// the new schema — TODO confirm against the schema checker.
	Amendable bool
}
  1470  
  1471  func (c *twoPhaseCommitter) amendPessimisticLock(ctx context.Context, addMutations CommitterMutations) error {
  1472  	keysNeedToLock := NewPlainMutations(addMutations.Len())
  1473  	for i := 0; i < addMutations.Len(); i++ {
  1474  		if addMutations.IsPessimisticLock(i) {
  1475  			keysNeedToLock.Push(addMutations.GetOp(i), addMutations.GetKey(i), addMutations.GetValue(i), addMutations.IsPessimisticLock(i))
  1476  		}
  1477  	}
  1478  	// For unique index amend, we need to pessimistic lock the generated new index keys first.
  1479  	// Set doingAmend to true to force the pessimistic lock do the exist check for these keys.
  1480  	c.doingAmend = true
  1481  	defer func() { c.doingAmend = false }()
  1482  	if keysNeedToLock.Len() > 0 {
  1483  		lCtx := kv.NewLockCtx(c.forUpdateTS, c.lockCtx.LockWaitTime(), time.Now())
  1484  		lCtx.Killed = c.lockCtx.Killed
  1485  		tryTimes := uint(0)
  1486  		retryLimit := config.GetGlobalConfig().PessimisticTxn.MaxRetryCount
  1487  		var err error
  1488  		for tryTimes < retryLimit {
  1489  			pessimisticLockBo := retry.NewBackofferWithVars(ctx, pessimisticLockMaxBackoff, c.txn.vars)
  1490  			err = c.pessimisticLockMutations(pessimisticLockBo, lCtx, &keysNeedToLock)
  1491  			if err != nil {
  1492  				// KeysNeedToLock won't change, so don't async rollback pessimistic locks here for write conflict.
  1493  				if _, ok := errors.Cause(err).(*tikverr.ErrWriteConflict); ok {
  1494  					newForUpdateTSVer, err := c.store.CurrentTimestamp(oracle.GlobalTxnScope)
  1495  					if err != nil {
  1496  						return errors.Trace(err)
  1497  					}
  1498  					lCtx.ForUpdateTS = newForUpdateTSVer
  1499  					c.forUpdateTS = newForUpdateTSVer
  1500  					logutil.Logger(ctx).Info("amend pessimistic lock pessimistic retry lock",
  1501  						zap.Uint("tryTimes", tryTimes), zap.Uint64("startTS", c.startTS),
  1502  						zap.Uint64("newForUpdateTS", c.forUpdateTS))
  1503  					tryTimes++
  1504  					continue
  1505  				}
  1506  				logutil.Logger(ctx).Warn("amend pessimistic lock has failed", zap.Error(err), zap.Uint64("txnStartTS", c.startTS))
  1507  				return err
  1508  			}
  1509  			logutil.Logger(ctx).Info("amend pessimistic lock finished", zap.Uint64("startTS", c.startTS),
  1510  				zap.Uint64("forUpdateTS", c.forUpdateTS), zap.Int("keys", keysNeedToLock.Len()))
  1511  			break
  1512  		}
  1513  		if err != nil {
  1514  			logutil.Logger(ctx).Warn("amend pessimistic lock failed after retry",
  1515  				zap.Uint("tryTimes", tryTimes), zap.Uint64("startTS", c.startTS))
  1516  			return err
  1517  		}
  1518  	}
  1519  	return nil
  1520  }
  1521  
  1522  func (c *twoPhaseCommitter) tryAmendTxn(ctx context.Context, startInfoSchema SchemaVer, change *RelatedSchemaChange) (bool, error) {
  1523  	addMutations, err := c.txn.schemaAmender.AmendTxn(ctx, startInfoSchema, change, c.mutations)
  1524  	if err != nil {
  1525  		return false, err
  1526  	}
  1527  	// Add new mutations to the mutation list or prewrite them if prewrite already starts.
  1528  	if addMutations != nil && addMutations.Len() > 0 {
  1529  		err = c.amendPessimisticLock(ctx, addMutations)
  1530  		if err != nil {
  1531  			logutil.Logger(ctx).Info("amendPessimisticLock has failed", zap.Error(err))
  1532  			return false, err
  1533  		}
  1534  		if c.prewriteStarted {
  1535  			prewriteBo := retry.NewBackofferWithVars(ctx, PrewriteMaxBackoff, c.txn.vars)
  1536  			err = c.prewriteMutations(prewriteBo, addMutations)
  1537  			if err != nil {
  1538  				logutil.Logger(ctx).Warn("amend prewrite has failed", zap.Error(err), zap.Uint64("txnStartTS", c.startTS))
  1539  				return false, err
  1540  			}
  1541  			logutil.Logger(ctx).Info("amend prewrite finished", zap.Uint64("txnStartTS", c.startTS))
  1542  			return true, nil
  1543  		}
  1544  		memBuf := c.txn.GetMemBuffer()
  1545  		for i := 0; i < addMutations.Len(); i++ {
  1546  			key := addMutations.GetKey(i)
  1547  			op := addMutations.GetOp(i)
  1548  			var err error
  1549  			if op == kvrpcpb.Op_Del {
  1550  				err = memBuf.Delete(key)
  1551  			} else {
  1552  				err = memBuf.Set(key, addMutations.GetValue(i))
  1553  			}
  1554  			if err != nil {
  1555  				logutil.Logger(ctx).Warn("amend mutations has failed", zap.Error(err), zap.Uint64("txnStartTS", c.startTS))
  1556  				return false, err
  1557  			}
  1558  			handle := c.txn.GetMemBuffer().IterWithFlags(key, nil).Handle()
  1559  			c.mutations.Push(op, addMutations.IsPessimisticLock(i), handle)
  1560  		}
  1561  	}
  1562  	return false, nil
  1563  }
  1564  
  1565  func (c *twoPhaseCommitter) getCommitTS(ctx context.Context, commitDetail *util.CommitDetails) (uint64, error) {
  1566  	start := time.Now()
  1567  	logutil.Event(ctx, "start get commit ts")
  1568  	commitTS, err := c.store.GetTimestampWithRetry(retry.NewBackofferWithVars(ctx, TsoMaxBackoff, c.txn.vars), c.txn.GetScope())
  1569  	if err != nil {
  1570  		logutil.Logger(ctx).Warn("2PC get commitTS failed",
  1571  			zap.Error(err),
  1572  			zap.Uint64("txnStartTS", c.startTS))
  1573  		return 0, errors.Trace(err)
  1574  	}
  1575  	commitDetail.GetCommitTsTime = time.Since(start)
  1576  	logutil.Event(ctx, "finish get commit ts")
  1577  	logutil.SetTag(ctx, "commitTS", commitTS)
  1578  
  1579  	// Check commitTS.
  1580  	if commitTS <= c.startTS {
  1581  		err = errors.Errorf("session %d invalid transaction tso with txnStartTS=%v while txnCommitTS=%v",
  1582  			c.sessionID, c.startTS, commitTS)
  1583  		logutil.BgLogger().Error("invalid transaction", zap.Error(err))
  1584  		return 0, errors.Trace(err)
  1585  	}
  1586  	return commitTS, nil
  1587  }
  1588  
  1589  // checkSchemaValid checks if the schema has changed, if tryAmend is set to true, committer will try to amend
  1590  // this transaction using the related schema changes.
  1591  func (c *twoPhaseCommitter) checkSchemaValid(ctx context.Context, checkTS uint64, startInfoSchema SchemaVer,
  1592  	tryAmend bool) (*RelatedSchemaChange, bool, error) {
  1593  	if _, err := util.EvalFailpoint("failCheckSchemaValid"); err == nil {
  1594  		logutil.Logger(ctx).Info("[failpoint] injected fail schema check",
  1595  			zap.Uint64("txnStartTS", c.startTS))
  1596  		err := errors.Errorf("mock check schema valid failure")
  1597  		return nil, false, err
  1598  	}
  1599  	if c.txn.schemaLeaseChecker == nil {
  1600  		if c.sessionID > 0 {
  1601  			logutil.Logger(ctx).Warn("schemaLeaseChecker is not set for this transaction",
  1602  				zap.Uint64("sessionID", c.sessionID),
  1603  				zap.Uint64("startTS", c.startTS),
  1604  				zap.Uint64("commitTS", checkTS))
  1605  		}
  1606  		return nil, false, nil
  1607  	}
  1608  	relatedChanges, err := c.txn.schemaLeaseChecker.CheckBySchemaVer(checkTS, startInfoSchema)
  1609  	if err != nil {
  1610  		if tryAmend && relatedChanges != nil && relatedChanges.Amendable && c.txn.schemaAmender != nil {
  1611  			memAmended, amendErr := c.tryAmendTxn(ctx, startInfoSchema, relatedChanges)
  1612  			if amendErr != nil {
  1613  				logutil.BgLogger().Info("txn amend has failed", zap.Uint64("sessionID", c.sessionID),
  1614  					zap.Uint64("startTS", c.startTS), zap.Error(amendErr))
  1615  				return nil, false, err
  1616  			}
  1617  			logutil.Logger(ctx).Info("amend txn successfully",
  1618  				zap.Uint64("sessionID", c.sessionID), zap.Uint64("txn startTS", c.startTS), zap.Bool("memAmended", memAmended),
  1619  				zap.Uint64("checkTS", checkTS), zap.Int64("startInfoSchemaVer", startInfoSchema.SchemaMetaVersion()),
  1620  				zap.Int64s("table ids", relatedChanges.PhyTblIDS), zap.Uint64s("action types", relatedChanges.ActionTypes))
  1621  			return relatedChanges, memAmended, nil
  1622  		}
  1623  		return nil, false, errors.Trace(err)
  1624  	}
  1625  	return nil, false, nil
  1626  }
  1627  
  1628  func (c *twoPhaseCommitter) calculateMaxCommitTS(ctx context.Context) error {
  1629  	// Amend txn with current time first, then we can make sure we have another SafeWindow time to commit
  1630  	currentTS := oracle.ComposeTS(int64(time.Since(c.txn.startTime)/time.Millisecond), 0) + c.startTS
  1631  	_, _, err := c.checkSchemaValid(ctx, currentTS, c.txn.schemaVer, true)
  1632  	if err != nil {
  1633  		logutil.Logger(ctx).Info("Schema changed for async commit txn",
  1634  			zap.Error(err),
  1635  			zap.Uint64("startTS", c.startTS))
  1636  		return errors.Trace(err)
  1637  	}
  1638  
  1639  	safeWindow := config.GetGlobalConfig().TiKVClient.AsyncCommit.SafeWindow
  1640  	maxCommitTS := oracle.ComposeTS(int64(safeWindow/time.Millisecond), 0) + currentTS
  1641  	logutil.BgLogger().Debug("calculate MaxCommitTS",
  1642  		zap.Time("startTime", c.txn.startTime),
  1643  		zap.Duration("safeWindow", safeWindow),
  1644  		zap.Uint64("startTS", c.startTS),
  1645  		zap.Uint64("maxCommitTS", maxCommitTS))
  1646  
  1647  	c.maxCommitTS = maxCommitTS
  1648  	return nil
  1649  }
  1650  
  1651  func (c *twoPhaseCommitter) shouldWriteBinlog() bool {
  1652  	return c.binlog != nil
  1653  }
  1654  
// txnCommitBatchSize is the Key+Value size budget, in bytes, for a single
// batch of mutations (16KB). TiKV recommends each RPC packet should be less
// than ~1MB, so each packet's Key+Value size is kept far below that.
const txnCommitBatchSize = 16 * 1024
  1658  
// batchMutations is a group of mutations together with the region they are
// assigned to.
type batchMutations struct {
	// region is the region this batch targets; relocate overwrites it.
	region locate.RegionVerID
	// mutations holds the mutations that belong to this batch.
	mutations CommitterMutations
	// isPrimary is set by batched.setPrimary on the batch holding the primary key.
	isPrimary bool
}
  1664  
  1665  func (b *batchMutations) relocate(bo *retry.Backoffer, c *locate.RegionCache) (bool, error) {
  1666  	begin, end := b.mutations.GetKey(0), b.mutations.GetKey(b.mutations.Len()-1)
  1667  	loc, err := c.LocateKey(bo, begin)
  1668  	if err != nil {
  1669  		return false, errors.Trace(err)
  1670  	}
  1671  	if !loc.Contains(end) {
  1672  		return false, nil
  1673  	}
  1674  	b.region = loc.Region
  1675  	return true, nil
  1676  }
  1677  
// batched accumulates batchMutations and remembers which batch contains the
// primary key.
type batched struct {
	// batches is the list of batches appended so far.
	batches []batchMutations
	// primaryIdx is the index in batches of the batch that contains primaryKey.
	// newBatched initializes it to -1, meaning the primary key has not been seen.
	primaryIdx int
	// primaryKey is the key searched for while batches are appended.
	primaryKey []byte
}
  1683  
  1684  func newBatched(primaryKey []byte) *batched {
  1685  	return &batched{
  1686  		primaryIdx: -1,
  1687  		primaryKey: primaryKey,
  1688  	}
  1689  }
  1690  
  1691  // appendBatchMutationsBySize appends mutations to b. It may split the keys to make
  1692  // sure each batch's size does not exceed the limit.
  1693  func (b *batched) appendBatchMutationsBySize(region locate.RegionVerID, mutations CommitterMutations, sizeFn func(k, v []byte) int, limit int) {
  1694  	if _, err := util.EvalFailpoint("twoPCRequestBatchSizeLimit"); err == nil {
  1695  		limit = 1
  1696  	}
  1697  
  1698  	var start, end int
  1699  	for start = 0; start < mutations.Len(); start = end {
  1700  		var size int
  1701  		for end = start; end < mutations.Len() && size < limit; end++ {
  1702  			var k, v []byte
  1703  			k = mutations.GetKey(end)
  1704  			v = mutations.GetValue(end)
  1705  			size += sizeFn(k, v)
  1706  			if b.primaryIdx < 0 && bytes.Equal(k, b.primaryKey) {
  1707  				b.primaryIdx = len(b.batches)
  1708  			}
  1709  		}
  1710  		b.batches = append(b.batches, batchMutations{
  1711  			region:    region,
  1712  			mutations: mutations.Slice(start, end),
  1713  		})
  1714  	}
  1715  }
  1716  
  1717  func (b *batched) setPrimary() bool {
  1718  	// If the batches include the primary key, put it to the first
  1719  	if b.primaryIdx >= 0 {
  1720  		if len(b.batches) > 0 {
  1721  			b.batches[b.primaryIdx].isPrimary = true
  1722  			b.batches[0], b.batches[b.primaryIdx] = b.batches[b.primaryIdx], b.batches[0]
  1723  			b.primaryIdx = 0
  1724  		}
  1725  		return true
  1726  	}
  1727  
  1728  	return false
  1729  }
  1730  
  1731  func (b *batched) allBatches() []batchMutations {
  1732  	return b.batches
  1733  }
  1734  
  1735  // primaryBatch returns the batch containing the primary key.
  1736  // Precondition: `b.setPrimary() == true`
  1737  func (b *batched) primaryBatch() []batchMutations {
  1738  	return b.batches[:1]
  1739  }
  1740  
  1741  func (b *batched) forgetPrimary() {
  1742  	if len(b.batches) == 0 {
  1743  		return
  1744  	}
  1745  	b.batches = b.batches[1:]
  1746  }
  1747  
// batchExecutor runs a two-phase-commit action over a list of batches
// concurrently, bounding the concurrency with a token-based rate limiter.
//
// NOTE: newBatchExecutor fills this struct with a positional literal, so the
// field order must not change.
type batchExecutor struct {
	rateLim           int                  // number of concurrent workers allowed
	rateLimiter       *util.RateLimit      // token limiter built from rateLim by initUtils
	committer         *twoPhaseCommitter   // committer the batches belong to
	action            twoPhaseCommitAction // action applied to every batch
	backoffer         *retry.Backoffer     // base backoffer, cloned or forked for each batch
	tokenWaitDuration time.Duration        // accumulated time spent waiting for tokens
}
  1757  
  1758  // newBatchExecutor create processor to handle concurrent batch works(prewrite/commit etc)
  1759  func newBatchExecutor(rateLimit int, committer *twoPhaseCommitter,
  1760  	action twoPhaseCommitAction, backoffer *retry.Backoffer) *batchExecutor {
  1761  	return &batchExecutor{rateLimit, nil, committer,
  1762  		action, backoffer, 0}
  1763  }
  1764  
  1765  // initUtils do initialize batchExecutor related policies like rateLimit util
  1766  func (batchExe *batchExecutor) initUtils() error {
  1767  	// init rateLimiter by injected rate limit number
  1768  	batchExe.rateLimiter = util.NewRateLimit(batchExe.rateLim)
  1769  	return nil
  1770  }
  1771  
  1772  // startWork concurrently do the work for each batch considering rate limit
  1773  func (batchExe *batchExecutor) startWorker(exitCh chan struct{}, ch chan error, batches []batchMutations) {
  1774  	for idx, batch1 := range batches {
  1775  		waitStart := time.Now()
  1776  		if exit := batchExe.rateLimiter.GetToken(exitCh); !exit {
  1777  			batchExe.tokenWaitDuration += time.Since(waitStart)
  1778  			batch := batch1
  1779  			go func() {
  1780  				defer batchExe.rateLimiter.PutToken()
  1781  				var singleBatchBackoffer *retry.Backoffer
  1782  				if _, ok := batchExe.action.(actionCommit); ok {
  1783  					// Because the secondary batches of the commit actions are implemented to be
  1784  					// committed asynchronously in background goroutines, we should not
  1785  					// fork a child context and call cancel() while the foreground goroutine exits.
  1786  					// Otherwise the background goroutines will be canceled execeptionally.
  1787  					// Here we makes a new clone of the original backoffer for this goroutine
  1788  					// exclusively to avoid the data race when using the same backoffer
  1789  					// in concurrent goroutines.
  1790  					singleBatchBackoffer = batchExe.backoffer.Clone()
  1791  				} else {
  1792  					var singleBatchCancel context.CancelFunc
  1793  					singleBatchBackoffer, singleBatchCancel = batchExe.backoffer.Fork()
  1794  					defer singleBatchCancel()
  1795  				}
  1796  				ch <- batchExe.action.handleSingleBatch(batchExe.committer, singleBatchBackoffer, batch)
  1797  				commitDetail := batchExe.committer.getDetail()
  1798  				// For prewrite, we record the max backoff time
  1799  				if _, ok := batchExe.action.(actionPrewrite); ok {
  1800  					commitDetail.Mu.Lock()
  1801  					boSleep := int64(singleBatchBackoffer.GetTotalSleep()) * int64(time.Millisecond)
  1802  					if boSleep > commitDetail.Mu.CommitBackoffTime {
  1803  						commitDetail.Mu.CommitBackoffTime = boSleep
  1804  						commitDetail.Mu.BackoffTypes = singleBatchBackoffer.GetTypes()
  1805  					}
  1806  					commitDetail.Mu.Unlock()
  1807  				}
  1808  				// Backoff time in the 2nd phase of a non-async-commit txn is added
  1809  				// in the commitTxn method, so we don't add it here.
  1810  			}()
  1811  		} else {
  1812  			logutil.Logger(batchExe.backoffer.GetCtx()).Info("break startWorker",
  1813  				zap.Stringer("action", batchExe.action), zap.Int("batch size", len(batches)),
  1814  				zap.Int("index", idx))
  1815  			break
  1816  		}
  1817  	}
  1818  }
  1819  
  1820  // process will start worker routine and collect results
  1821  func (batchExe *batchExecutor) process(batches []batchMutations) error {
  1822  	var err error
  1823  	err = batchExe.initUtils()
  1824  	if err != nil {
  1825  		logutil.Logger(batchExe.backoffer.GetCtx()).Error("batchExecutor initUtils failed", zap.Error(err))
  1826  		return err
  1827  	}
  1828  
  1829  	// For prewrite, stop sending other requests after receiving first error.
  1830  	var cancel context.CancelFunc
  1831  	if _, ok := batchExe.action.(actionPrewrite); ok {
  1832  		batchExe.backoffer, cancel = batchExe.backoffer.Fork()
  1833  		defer cancel()
  1834  	}
  1835  	// concurrently do the work for each batch.
  1836  	ch := make(chan error, len(batches))
  1837  	exitCh := make(chan struct{})
  1838  	go batchExe.startWorker(exitCh, ch, batches)
  1839  	// check results
  1840  	for i := 0; i < len(batches); i++ {
  1841  		if e := <-ch; e != nil {
  1842  			logutil.Logger(batchExe.backoffer.GetCtx()).Debug("2PC doActionOnBatch failed",
  1843  				zap.Uint64("session", batchExe.committer.sessionID),
  1844  				zap.Stringer("action type", batchExe.action),
  1845  				zap.Error(e),
  1846  				zap.Uint64("txnStartTS", batchExe.committer.startTS))
  1847  			// Cancel other requests and return the first error.
  1848  			if cancel != nil {
  1849  				logutil.Logger(batchExe.backoffer.GetCtx()).Debug("2PC doActionOnBatch to cancel other actions",
  1850  					zap.Uint64("session", batchExe.committer.sessionID),
  1851  					zap.Stringer("action type", batchExe.action),
  1852  					zap.Uint64("txnStartTS", batchExe.committer.startTS))
  1853  				atomic.StoreUint32(&batchExe.committer.prewriteCancelled, 1)
  1854  				cancel()
  1855  			}
  1856  			if err == nil {
  1857  				err = e
  1858  			}
  1859  		}
  1860  	}
  1861  	close(exitCh)
  1862  	if batchExe.tokenWaitDuration > 0 {
  1863  		metrics.TiKVTokenWaitDuration.Observe(float64(batchExe.tokenWaitDuration.Nanoseconds()))
  1864  	}
  1865  	return err
  1866  }
  1867  
  1868  func (c *twoPhaseCommitter) setDetail(d *util.CommitDetails) {
  1869  	atomic.StorePointer(&c.detail, unsafe.Pointer(d))
  1870  }
  1871  
  1872  func (c *twoPhaseCommitter) getDetail() *util.CommitDetails {
  1873  	return (*util.CommitDetails)(atomic.LoadPointer(&c.detail))
  1874  }
  1875  
  1876  func (c *twoPhaseCommitter) setUndeterminedErr(err error) {
  1877  	c.mu.Lock()
  1878  	defer c.mu.Unlock()
  1879  	c.mu.undeterminedErr = err
  1880  }
  1881  
  1882  func (c *twoPhaseCommitter) getUndeterminedErr() error {
  1883  	c.mu.RLock()
  1884  	defer c.mu.RUnlock()
  1885  	return c.mu.undeterminedErr
  1886  }
  1887  
  1888  func (c *twoPhaseCommitter) mutationsOfKeys(keys [][]byte) CommitterMutations {
  1889  	var res PlainMutations
  1890  	for i := 0; i < c.mutations.Len(); i++ {
  1891  		for _, key := range keys {
  1892  			if bytes.Equal(c.mutations.GetKey(i), key) {
  1893  				res.Push(c.mutations.GetOp(i), c.mutations.GetKey(i), c.mutations.GetValue(i), c.mutations.IsPessimisticLock(i))
  1894  				break
  1895  			}
  1896  		}
  1897  	}
  1898  	return &res
  1899  }