// Source: github.com/KinWaiYuen/client-go/v2@v2.5.4/txnkv/transaction/2pc.go

// Copyright 2021 TiKV Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// NOTE: The code in this file is based on code from the
// TiDB project, licensed under the Apache License v 2.0
//
// https://github.com/pingcap/tidb/tree/cc5e161ac06827589c4966674597c137cc9e809c/store/tikv/2pc.go
//

// Copyright 2016 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
34 35 package transaction 36 37 import ( 38 "bytes" 39 "context" 40 "encoding/hex" 41 "math" 42 "math/rand" 43 "strings" 44 "sync" 45 "sync/atomic" 46 "time" 47 "unsafe" 48 49 "github.com/KinWaiYuen/client-go/v2/config" 50 tikverr "github.com/KinWaiYuen/client-go/v2/error" 51 "github.com/KinWaiYuen/client-go/v2/internal/client" 52 "github.com/KinWaiYuen/client-go/v2/internal/latch" 53 "github.com/KinWaiYuen/client-go/v2/internal/locate" 54 "github.com/KinWaiYuen/client-go/v2/internal/logutil" 55 "github.com/KinWaiYuen/client-go/v2/internal/retry" 56 "github.com/KinWaiYuen/client-go/v2/internal/unionstore" 57 "github.com/KinWaiYuen/client-go/v2/kv" 58 "github.com/KinWaiYuen/client-go/v2/metrics" 59 "github.com/KinWaiYuen/client-go/v2/oracle" 60 "github.com/KinWaiYuen/client-go/v2/tikvrpc" 61 "github.com/KinWaiYuen/client-go/v2/txnkv/txnlock" 62 "github.com/KinWaiYuen/client-go/v2/util" 63 "github.com/pingcap/errors" 64 "github.com/pingcap/kvproto/pkg/kvrpcpb" 65 "github.com/pingcap/parser/terror" 66 "github.com/prometheus/client_golang/prometheus" 67 zap "go.uber.org/zap" 68 ) 69 70 // If the duration of a single request exceeds the slowRequestThreshold, a warning log will be logged. 71 const slowRequestThreshold = time.Minute 72 73 type twoPhaseCommitAction interface { 74 handleSingleBatch(*twoPhaseCommitter, *retry.Backoffer, batchMutations) error 75 tiKVTxnRegionsNumHistogram() prometheus.Observer 76 String() string 77 } 78 79 // Global variable set by config file. 80 var ( 81 ManagedLockTTL uint64 = 20000 // 20s 82 ) 83 84 var ( 85 // PrewriteMaxBackoff is max sleep time of the `pre-write` command. 86 PrewriteMaxBackoff = 40000 87 // CommitMaxBackoff is max sleep time of the 'commit' command 88 CommitMaxBackoff = uint64(40000) 89 ) 90 91 type kvstore interface { 92 // GetRegionCache gets the RegionCache. 93 GetRegionCache() *locate.RegionCache 94 // SplitRegions splits regions by splitKeys. 
95 SplitRegions(ctx context.Context, splitKeys [][]byte, scatter bool, tableID *int64) (regionIDs []uint64, err error) 96 // WaitScatterRegionFinish implements SplittableStore interface. 97 // backOff is the back off time of the wait scatter region.(Milliseconds) 98 // if backOff <= 0, the default wait scatter back off time will be used. 99 WaitScatterRegionFinish(ctx context.Context, regionID uint64, backOff int) error 100 101 // GetTimestampWithRetry returns latest timestamp. 102 GetTimestampWithRetry(bo *retry.Backoffer, scope string) (uint64, error) 103 // GetOracle gets a timestamp oracle client. 104 GetOracle() oracle.Oracle 105 CurrentTimestamp(txnScope string) (uint64, error) 106 // SendReq sends a request to TiKV. 107 SendReq(bo *retry.Backoffer, req *tikvrpc.Request, regionID locate.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error) 108 // GetTiKVClient gets the client instance. 109 GetTiKVClient() (client client.Client) 110 GetLockResolver() *txnlock.LockResolver 111 Ctx() context.Context 112 WaitGroup() *sync.WaitGroup 113 // TxnLatches returns txnLatches. 114 TxnLatches() *latch.LatchesScheduler 115 GetClusterID() uint64 116 } 117 118 // twoPhaseCommitter executes a two-phase commit protocol. 119 type twoPhaseCommitter struct { 120 store kvstore 121 txn *KVTxn 122 startTS uint64 123 mutations *memBufferMutations 124 lockTTL uint64 125 commitTS uint64 126 priority kvrpcpb.CommandPri 127 sessionID uint64 // sessionID is used for log. 128 cleanWg sync.WaitGroup 129 detail unsafe.Pointer 130 txnSize int 131 hasNoNeedCommitKeys bool 132 133 primaryKey []byte 134 forUpdateTS uint64 135 136 mu struct { 137 sync.RWMutex 138 undeterminedErr error // undeterminedErr saves the rpc error we encounter when commit primary key. 
139 committed bool 140 } 141 syncLog bool 142 // For pessimistic transaction 143 isPessimistic bool 144 isFirstLock bool 145 // regionTxnSize stores the number of keys involved in each region 146 regionTxnSize map[uint64]int 147 // Used by pessimistic transaction and large transaction. 148 ttlManager 149 150 testingKnobs struct { 151 acAfterCommitPrimary chan struct{} 152 bkAfterCommitPrimary chan struct{} 153 noFallBack bool 154 } 155 156 useAsyncCommit uint32 157 minCommitTS uint64 158 maxCommitTS uint64 159 prewriteStarted bool 160 prewriteCancelled uint32 161 useOnePC uint32 162 onePCCommitTS uint64 163 164 hasTriedAsyncCommit bool 165 hasTriedOnePC bool 166 167 // doingAmend means the amend prewrite is ongoing. 168 doingAmend bool 169 170 binlog BinlogExecutor 171 172 resourceGroupTag []byte 173 174 // allowed when tikv disk full happened. 175 diskFullOpt kvrpcpb.DiskFullOpt 176 } 177 178 type memBufferMutations struct { 179 storage *unionstore.MemDB 180 handles []unionstore.MemKeyHandle 181 } 182 183 func newMemBufferMutations(sizeHint int, storage *unionstore.MemDB) *memBufferMutations { 184 return &memBufferMutations{ 185 handles: make([]unionstore.MemKeyHandle, 0, sizeHint), 186 storage: storage, 187 } 188 } 189 190 func (m *memBufferMutations) Len() int { 191 return len(m.handles) 192 } 193 194 func (m *memBufferMutations) GetKey(i int) []byte { 195 return m.storage.GetKeyByHandle(m.handles[i]) 196 } 197 198 func (m *memBufferMutations) GetKeys() [][]byte { 199 ret := make([][]byte, m.Len()) 200 for i := range ret { 201 ret[i] = m.GetKey(i) 202 } 203 return ret 204 } 205 206 func (m *memBufferMutations) GetValue(i int) []byte { 207 v, _ := m.storage.GetValueByHandle(m.handles[i]) 208 return v 209 } 210 211 func (m *memBufferMutations) GetOp(i int) kvrpcpb.Op { 212 return kvrpcpb.Op(m.handles[i].UserData >> 1) 213 } 214 215 func (m *memBufferMutations) IsPessimisticLock(i int) bool { 216 return m.handles[i].UserData&1 != 0 217 } 218 219 func (m 
*memBufferMutations) Slice(from, to int) CommitterMutations { 220 return &memBufferMutations{ 221 handles: m.handles[from:to], 222 storage: m.storage, 223 } 224 } 225 226 func (m *memBufferMutations) Push(op kvrpcpb.Op, isPessimisticLock bool, handle unionstore.MemKeyHandle) { 227 aux := uint16(op) << 1 228 if isPessimisticLock { 229 aux |= 1 230 } 231 handle.UserData = aux 232 m.handles = append(m.handles, handle) 233 } 234 235 // CommitterMutations contains the mutations to be submitted. 236 type CommitterMutations interface { 237 Len() int 238 GetKey(i int) []byte 239 GetKeys() [][]byte 240 GetOp(i int) kvrpcpb.Op 241 GetValue(i int) []byte 242 IsPessimisticLock(i int) bool 243 Slice(from, to int) CommitterMutations 244 } 245 246 // PlainMutations contains transaction operations. 247 type PlainMutations struct { 248 ops []kvrpcpb.Op 249 keys [][]byte 250 values [][]byte 251 isPessimisticLock []bool 252 } 253 254 // NewPlainMutations creates a PlainMutations object with sizeHint reserved. 255 func NewPlainMutations(sizeHint int) PlainMutations { 256 return PlainMutations{ 257 ops: make([]kvrpcpb.Op, 0, sizeHint), 258 keys: make([][]byte, 0, sizeHint), 259 values: make([][]byte, 0, sizeHint), 260 isPessimisticLock: make([]bool, 0, sizeHint), 261 } 262 } 263 264 // Slice return a sub mutations in range [from, to). 265 func (c *PlainMutations) Slice(from, to int) CommitterMutations { 266 var res PlainMutations 267 res.keys = c.keys[from:to] 268 if c.ops != nil { 269 res.ops = c.ops[from:to] 270 } 271 if c.values != nil { 272 res.values = c.values[from:to] 273 } 274 if c.isPessimisticLock != nil { 275 res.isPessimisticLock = c.isPessimisticLock[from:to] 276 } 277 return &res 278 } 279 280 // Push another mutation into mutations. 
281 func (c *PlainMutations) Push(op kvrpcpb.Op, key []byte, value []byte, isPessimisticLock bool) { 282 c.ops = append(c.ops, op) 283 c.keys = append(c.keys, key) 284 c.values = append(c.values, value) 285 c.isPessimisticLock = append(c.isPessimisticLock, isPessimisticLock) 286 } 287 288 // Len returns the count of mutations. 289 func (c *PlainMutations) Len() int { 290 return len(c.keys) 291 } 292 293 // GetKey returns the key at index. 294 func (c *PlainMutations) GetKey(i int) []byte { 295 return c.keys[i] 296 } 297 298 // GetKeys returns the keys. 299 func (c *PlainMutations) GetKeys() [][]byte { 300 return c.keys 301 } 302 303 // GetOps returns the key ops. 304 func (c *PlainMutations) GetOps() []kvrpcpb.Op { 305 return c.ops 306 } 307 308 // GetValues returns the key values. 309 func (c *PlainMutations) GetValues() [][]byte { 310 return c.values 311 } 312 313 // GetPessimisticFlags returns the key pessimistic flags. 314 func (c *PlainMutations) GetPessimisticFlags() []bool { 315 return c.isPessimisticLock 316 } 317 318 // GetOp returns the key op at index. 319 func (c *PlainMutations) GetOp(i int) kvrpcpb.Op { 320 return c.ops[i] 321 } 322 323 // GetValue returns the key value at index. 324 func (c *PlainMutations) GetValue(i int) []byte { 325 if len(c.values) <= i { 326 return nil 327 } 328 return c.values[i] 329 } 330 331 // IsPessimisticLock returns the key pessimistic flag at index. 332 func (c *PlainMutations) IsPessimisticLock(i int) bool { 333 return c.isPessimisticLock[i] 334 } 335 336 // PlainMutation represents a single transaction operation. 337 type PlainMutation struct { 338 KeyOp kvrpcpb.Op 339 Key []byte 340 Value []byte 341 IsPessimisticLock bool 342 } 343 344 // MergeMutations append input mutations into current mutations. 345 func (c *PlainMutations) MergeMutations(mutations PlainMutations) { 346 c.ops = append(c.ops, mutations.ops...) 347 c.keys = append(c.keys, mutations.keys...) 348 c.values = append(c.values, mutations.values...) 
349 c.isPessimisticLock = append(c.isPessimisticLock, mutations.isPessimisticLock...) 350 } 351 352 // AppendMutation merges a single Mutation into the current mutations. 353 func (c *PlainMutations) AppendMutation(mutation PlainMutation) { 354 c.ops = append(c.ops, mutation.KeyOp) 355 c.keys = append(c.keys, mutation.Key) 356 c.values = append(c.values, mutation.Value) 357 c.isPessimisticLock = append(c.isPessimisticLock, mutation.IsPessimisticLock) 358 } 359 360 // newTwoPhaseCommitter creates a twoPhaseCommitter. 361 func newTwoPhaseCommitter(txn *KVTxn, sessionID uint64) (*twoPhaseCommitter, error) { 362 return &twoPhaseCommitter{ 363 store: txn.store, 364 txn: txn, 365 startTS: txn.StartTS(), 366 sessionID: sessionID, 367 regionTxnSize: map[uint64]int{}, 368 isPessimistic: txn.IsPessimistic(), 369 binlog: txn.binlog, 370 diskFullOpt: kvrpcpb.DiskFullOpt_NotAllowedOnFull, 371 }, nil 372 } 373 374 func (c *twoPhaseCommitter) extractKeyExistsErr(err *tikverr.ErrKeyExist) error { 375 if !c.txn.us.HasPresumeKeyNotExists(err.GetKey()) { 376 return errors.Errorf("session %d, existErr for key:%s should not be nil", c.sessionID, err.GetKey()) 377 } 378 return errors.Trace(err) 379 } 380 381 // KVFilter is a filter that filters out unnecessary KV pairs. 382 type KVFilter interface { 383 // IsUnnecessaryKeyValue returns whether this KV pair should be committed. 
384 IsUnnecessaryKeyValue(key, value []byte, flags kv.KeyFlags) bool 385 } 386 387 func (c *twoPhaseCommitter) initKeysAndMutations() error { 388 var size, putCnt, delCnt, lockCnt, checkCnt int 389 390 txn := c.txn 391 memBuf := txn.GetMemBuffer() 392 sizeHint := txn.us.GetMemBuffer().Len() 393 c.mutations = newMemBufferMutations(sizeHint, memBuf) 394 c.isPessimistic = txn.IsPessimistic() 395 filter := txn.kvFilter 396 397 var err error 398 for it := memBuf.IterWithFlags(nil, nil); it.Valid(); err = it.Next() { 399 _ = err 400 key := it.Key() 401 flags := it.Flags() 402 var value []byte 403 var op kvrpcpb.Op 404 405 if !it.HasValue() { 406 if !flags.HasLocked() { 407 continue 408 } 409 op = kvrpcpb.Op_Lock 410 lockCnt++ 411 } else { 412 value = it.Value() 413 isUnnecessaryKV := filter != nil && filter.IsUnnecessaryKeyValue(key, value, flags) 414 if len(value) > 0 { 415 if isUnnecessaryKV { 416 if !flags.HasLocked() { 417 continue 418 } 419 // If the key was locked before, we should prewrite the lock even if 420 // the KV needn't be committed according to the filter. Otherwise, we 421 // were forgetting removing pessimistic locks added before. 422 op = kvrpcpb.Op_Lock 423 lockCnt++ 424 } else { 425 op = kvrpcpb.Op_Put 426 if flags.HasPresumeKeyNotExists() { 427 op = kvrpcpb.Op_Insert 428 } 429 putCnt++ 430 } 431 } else { 432 if isUnnecessaryKV { 433 continue 434 } 435 if !txn.IsPessimistic() && flags.HasPresumeKeyNotExists() { 436 // delete-your-writes keys in optimistic txn need check not exists in prewrite-phase 437 // due to `Op_CheckNotExists` doesn't prewrite lock, so mark those keys should not be used in commit-phase. 
438 op = kvrpcpb.Op_CheckNotExists 439 checkCnt++ 440 memBuf.UpdateFlags(key, kv.SetPrewriteOnly) 441 } else { 442 // normal delete keys in optimistic txn can be delete without not exists checking 443 // delete-your-writes keys in pessimistic txn can ensure must be no exists so can directly delete them 444 op = kvrpcpb.Op_Del 445 delCnt++ 446 } 447 } 448 } 449 450 var isPessimistic bool 451 if flags.HasLocked() { 452 isPessimistic = c.isPessimistic 453 } 454 c.mutations.Push(op, isPessimistic, it.Handle()) 455 size += len(key) + len(value) 456 457 if len(c.primaryKey) == 0 && op != kvrpcpb.Op_CheckNotExists { 458 c.primaryKey = key 459 } 460 } 461 462 if c.mutations.Len() == 0 { 463 return nil 464 } 465 c.txnSize = size 466 467 const logEntryCount = 10000 468 const logSize = 4 * 1024 * 1024 // 4MB 469 if c.mutations.Len() > logEntryCount || size > logSize { 470 logutil.BgLogger().Info("[BIG_TXN]", 471 zap.Uint64("session", c.sessionID), 472 zap.String("key sample", kv.StrKey(c.mutations.GetKey(0))), 473 zap.Int("size", size), 474 zap.Int("keys", c.mutations.Len()), 475 zap.Int("puts", putCnt), 476 zap.Int("dels", delCnt), 477 zap.Int("locks", lockCnt), 478 zap.Int("checks", checkCnt), 479 zap.Uint64("txnStartTS", txn.startTS)) 480 } 481 482 // Sanity check for startTS. 
483 if txn.StartTS() == math.MaxUint64 { 484 err = errors.Errorf("try to commit with invalid txnStartTS: %d", txn.StartTS()) 485 logutil.BgLogger().Error("commit failed", 486 zap.Uint64("session", c.sessionID), 487 zap.Error(err)) 488 return errors.Trace(err) 489 } 490 491 commitDetail := &util.CommitDetails{WriteSize: size, WriteKeys: c.mutations.Len()} 492 metrics.TiKVTxnWriteKVCountHistogram.Observe(float64(commitDetail.WriteKeys)) 493 metrics.TiKVTxnWriteSizeHistogram.Observe(float64(commitDetail.WriteSize)) 494 c.hasNoNeedCommitKeys = checkCnt > 0 495 c.lockTTL = txnLockTTL(txn.startTime, size) 496 c.priority = txn.priority.ToPB() 497 c.syncLog = txn.syncLog 498 c.resourceGroupTag = txn.resourceGroupTag 499 c.setDetail(commitDetail) 500 return nil 501 } 502 503 func (c *twoPhaseCommitter) primary() []byte { 504 if len(c.primaryKey) == 0 { 505 return c.mutations.GetKey(0) 506 } 507 return c.primaryKey 508 } 509 510 // asyncSecondaries returns all keys that must be checked in the recovery phase of an async commit. 511 func (c *twoPhaseCommitter) asyncSecondaries() [][]byte { 512 secondaries := make([][]byte, 0, c.mutations.Len()) 513 for i := 0; i < c.mutations.Len(); i++ { 514 k := c.mutations.GetKey(i) 515 if bytes.Equal(k, c.primary()) || c.mutations.GetOp(i) == kvrpcpb.Op_CheckNotExists { 516 continue 517 } 518 secondaries = append(secondaries, k) 519 } 520 return secondaries 521 } 522 523 const bytesPerMiB = 1024 * 1024 524 525 // ttl = ttlFactor * sqrt(writeSizeInMiB) 526 var ttlFactor = 6000 527 528 // By default, locks after 3000ms is considered unusual (the client created the 529 // lock might be dead). Other client may cleanup this kind of lock. 530 // For locks created recently, we will do backoff and retry. 531 var defaultLockTTL uint64 = 3000 532 533 func txnLockTTL(startTime time.Time, txnSize int) uint64 { 534 // Increase lockTTL for large transactions. 535 // The formula is `ttl = ttlFactor * sqrt(sizeInMiB)`. 
536 // When writeSize is less than 256KB, the base ttl is defaultTTL (3s); 537 // When writeSize is 1MiB, 4MiB, or 10MiB, ttl is 6s, 12s, 20s correspondingly; 538 lockTTL := defaultLockTTL 539 if txnSize >= txnCommitBatchSize { 540 sizeMiB := float64(txnSize) / bytesPerMiB 541 lockTTL = uint64(float64(ttlFactor) * math.Sqrt(sizeMiB)) 542 if lockTTL < defaultLockTTL { 543 lockTTL = defaultLockTTL 544 } 545 if lockTTL > ManagedLockTTL { 546 lockTTL = ManagedLockTTL 547 } 548 } 549 550 // Increase lockTTL by the transaction's read time. 551 // When resolving a lock, we compare current ts and startTS+lockTTL to decide whether to clean up. If a txn 552 // takes a long time to read, increasing its TTL will help to prevent it from been aborted soon after prewrite. 553 elapsed := time.Since(startTime) / time.Millisecond 554 return lockTTL + uint64(elapsed) 555 } 556 557 var preSplitDetectThreshold uint32 = 100000 558 var preSplitSizeThreshold uint32 = 32 << 20 559 560 // doActionOnMutations groups keys into primary batch and secondary batches, if primary batch exists in the key, 561 // it does action on primary batch first, then on secondary batches. If action is commit, secondary batches 562 // is done in background goroutine. 563 func (c *twoPhaseCommitter) doActionOnMutations(bo *retry.Backoffer, action twoPhaseCommitAction, mutations CommitterMutations) error { 564 if mutations.Len() == 0 { 565 return nil 566 } 567 groups, err := c.groupMutations(bo, mutations) 568 if err != nil { 569 return errors.Trace(err) 570 } 571 572 // This is redundant since `doActionOnGroupMutations` will still split groups into batches and 573 // check the number of batches. However we don't want the check fail after any code changes. 
574 c.checkOnePCFallBack(action, len(groups)) 575 576 return c.doActionOnGroupMutations(bo, action, groups) 577 } 578 579 type groupedMutations struct { 580 region locate.RegionVerID 581 mutations CommitterMutations 582 } 583 584 // groupSortedMutationsByRegion separates keys into groups by their belonging Regions. 585 func groupSortedMutationsByRegion(c *locate.RegionCache, bo *retry.Backoffer, m CommitterMutations) ([]groupedMutations, error) { 586 var ( 587 groups []groupedMutations 588 lastLoc *locate.KeyLocation 589 ) 590 lastUpperBound := 0 591 for i := 0; i < m.Len(); i++ { 592 if lastLoc == nil || !lastLoc.Contains(m.GetKey(i)) { 593 if lastLoc != nil { 594 groups = append(groups, groupedMutations{ 595 region: lastLoc.Region, 596 mutations: m.Slice(lastUpperBound, i), 597 }) 598 lastUpperBound = i 599 } 600 var err error 601 lastLoc, err = c.LocateKey(bo, m.GetKey(i)) 602 if err != nil { 603 return nil, errors.Trace(err) 604 } 605 } 606 } 607 if lastLoc != nil { 608 groups = append(groups, groupedMutations{ 609 region: lastLoc.Region, 610 mutations: m.Slice(lastUpperBound, m.Len()), 611 }) 612 } 613 return groups, nil 614 } 615 616 // groupMutations groups mutations by region, then checks for any large groups and in that case pre-splits the region. 617 func (c *twoPhaseCommitter) groupMutations(bo *retry.Backoffer, mutations CommitterMutations) ([]groupedMutations, error) { 618 groups, err := groupSortedMutationsByRegion(c.store.GetRegionCache(), bo, mutations) 619 if err != nil { 620 return nil, errors.Trace(err) 621 } 622 623 // Pre-split regions to avoid too much write workload into a single region. 624 // In the large transaction case, this operation is important to avoid TiKV 'server is busy' error. 
625 var didPreSplit bool 626 preSplitDetectThresholdVal := atomic.LoadUint32(&preSplitDetectThreshold) 627 for _, group := range groups { 628 if uint32(group.mutations.Len()) >= preSplitDetectThresholdVal { 629 logutil.BgLogger().Info("2PC detect large amount of mutations on a single region", 630 zap.Uint64("region", group.region.GetID()), 631 zap.Int("mutations count", group.mutations.Len())) 632 if c.preSplitRegion(bo.GetCtx(), group) { 633 didPreSplit = true 634 } 635 } 636 } 637 // Reload region cache again. 638 if didPreSplit { 639 groups, err = groupSortedMutationsByRegion(c.store.GetRegionCache(), bo, mutations) 640 if err != nil { 641 return nil, errors.Trace(err) 642 } 643 } 644 645 return groups, nil 646 } 647 648 func (c *twoPhaseCommitter) preSplitRegion(ctx context.Context, group groupedMutations) bool { 649 splitKeys := make([][]byte, 0, 4) 650 651 preSplitSizeThresholdVal := atomic.LoadUint32(&preSplitSizeThreshold) 652 regionSize := 0 653 keysLength := group.mutations.Len() 654 // The value length maybe zero for pessimistic lock keys 655 for i := 0; i < keysLength; i++ { 656 regionSize = regionSize + len(group.mutations.GetKey(i)) + len(group.mutations.GetValue(i)) 657 // The second condition is used for testing. 
658 if regionSize >= int(preSplitSizeThresholdVal) { 659 regionSize = 0 660 splitKeys = append(splitKeys, group.mutations.GetKey(i)) 661 } 662 } 663 if len(splitKeys) == 0 { 664 return false 665 } 666 667 regionIDs, err := c.store.SplitRegions(ctx, splitKeys, true, nil) 668 if err != nil { 669 logutil.BgLogger().Warn("2PC split regions failed", zap.Uint64("regionID", group.region.GetID()), 670 zap.Int("keys count", keysLength), zap.Error(err)) 671 return false 672 } 673 674 for _, regionID := range regionIDs { 675 err := c.store.WaitScatterRegionFinish(ctx, regionID, 0) 676 if err != nil { 677 logutil.BgLogger().Warn("2PC wait scatter region failed", zap.Uint64("regionID", regionID), zap.Error(err)) 678 } 679 } 680 // Invalidate the old region cache information. 681 c.store.GetRegionCache().InvalidateCachedRegion(group.region) 682 return true 683 } 684 685 // CommitSecondaryMaxBackoff is max sleep time of the 'commit' command 686 const CommitSecondaryMaxBackoff = 41000 687 688 // doActionOnGroupedMutations splits groups into batches (there is one group per region, and potentially many batches per group, but all mutations 689 // in a batch will belong to the same region). 690 func (c *twoPhaseCommitter) doActionOnGroupMutations(bo *retry.Backoffer, action twoPhaseCommitAction, groups []groupedMutations) error { 691 action.tiKVTxnRegionsNumHistogram().Observe(float64(len(groups))) 692 693 var sizeFunc = c.keySize 694 695 switch act := action.(type) { 696 case actionPrewrite: 697 // Do not update regionTxnSize on retries. They are not used when building a PrewriteRequest. 
698 if !act.retry { 699 for _, group := range groups { 700 c.regionTxnSize[group.region.GetID()] = group.mutations.Len() 701 } 702 } 703 sizeFunc = c.keyValueSize 704 atomic.AddInt32(&c.getDetail().PrewriteRegionNum, int32(len(groups))) 705 case actionPessimisticLock: 706 if act.LockCtx.Stats != nil { 707 act.LockCtx.Stats.RegionNum = int32(len(groups)) 708 } 709 } 710 711 batchBuilder := newBatched(c.primary()) 712 for _, group := range groups { 713 batchBuilder.appendBatchMutationsBySize(group.region, group.mutations, sizeFunc, txnCommitBatchSize) 714 } 715 firstIsPrimary := batchBuilder.setPrimary() 716 717 actionCommit, actionIsCommit := action.(actionCommit) 718 _, actionIsCleanup := action.(actionCleanup) 719 _, actionIsPessimisticLock := action.(actionPessimisticLock) 720 721 c.checkOnePCFallBack(action, len(batchBuilder.allBatches())) 722 723 var err error 724 if val, err := util.EvalFailpoint("skipKeyReturnOK"); err == nil { 725 valStr, ok := val.(string) 726 if ok && c.sessionID > 0 { 727 if firstIsPrimary && actionIsPessimisticLock { 728 logutil.Logger(bo.GetCtx()).Warn("pessimisticLock failpoint", zap.String("valStr", valStr)) 729 switch valStr { 730 case "pessimisticLockSkipPrimary": 731 err = c.doActionOnBatches(bo, action, batchBuilder.allBatches()) 732 return err 733 case "pessimisticLockSkipSecondary": 734 err = c.doActionOnBatches(bo, action, batchBuilder.primaryBatch()) 735 return err 736 } 737 } 738 } 739 } 740 if _, err := util.EvalFailpoint("pessimisticRollbackDoNth"); err == nil { 741 _, actionIsPessimisticRollback := action.(actionPessimisticRollback) 742 if actionIsPessimisticRollback && c.sessionID > 0 { 743 logutil.Logger(bo.GetCtx()).Warn("pessimisticRollbackDoNth failpoint") 744 return nil 745 } 746 } 747 748 if firstIsPrimary && 749 ((actionIsCommit && !c.isAsyncCommit()) || actionIsCleanup || actionIsPessimisticLock) { 750 // primary should be committed(not async commit)/cleanup/pessimistically locked first 751 err = 
c.doActionOnBatches(bo, action, batchBuilder.primaryBatch()) 752 if err != nil { 753 return errors.Trace(err) 754 } 755 if actionIsCommit && c.testingKnobs.bkAfterCommitPrimary != nil && c.testingKnobs.acAfterCommitPrimary != nil { 756 c.testingKnobs.acAfterCommitPrimary <- struct{}{} 757 <-c.testingKnobs.bkAfterCommitPrimary 758 } 759 batchBuilder.forgetPrimary() 760 } 761 util.EvalFailpoint("afterPrimaryBatch") 762 763 // Already spawned a goroutine for async commit transaction. 764 if actionIsCommit && !actionCommit.retry && !c.isAsyncCommit() { 765 secondaryBo := retry.NewBackofferWithVars(c.store.Ctx(), CommitSecondaryMaxBackoff, c.txn.vars) 766 c.store.WaitGroup().Add(1) 767 go func() { 768 defer c.store.WaitGroup().Done() 769 if c.sessionID > 0 { 770 if v, err := util.EvalFailpoint("beforeCommitSecondaries"); err == nil { 771 if s, ok := v.(string); !ok { 772 logutil.Logger(bo.GetCtx()).Info("[failpoint] sleep 2s before commit secondary keys", 773 zap.Uint64("sessionID", c.sessionID), zap.Uint64("txnStartTS", c.startTS), zap.Uint64("txnCommitTS", c.commitTS)) 774 time.Sleep(2 * time.Second) 775 } else if s == "skip" { 776 logutil.Logger(bo.GetCtx()).Info("[failpoint] injected skip committing secondaries", 777 zap.Uint64("sessionID", c.sessionID), zap.Uint64("txnStartTS", c.startTS), zap.Uint64("txnCommitTS", c.commitTS)) 778 return 779 } 780 } 781 } 782 783 e := c.doActionOnBatches(secondaryBo, action, batchBuilder.allBatches()) 784 if e != nil { 785 logutil.BgLogger().Debug("2PC async doActionOnBatches", 786 zap.Uint64("session", c.sessionID), 787 zap.Stringer("action type", action), 788 zap.Error(e)) 789 metrics.SecondaryLockCleanupFailureCounterCommit.Inc() 790 } 791 }() 792 } else { 793 err = c.doActionOnBatches(bo, action, batchBuilder.allBatches()) 794 } 795 return errors.Trace(err) 796 } 797 798 // doActionOnBatches does action to batches in parallel. 
799 func (c *twoPhaseCommitter) doActionOnBatches(bo *retry.Backoffer, action twoPhaseCommitAction, batches []batchMutations) error { 800 if len(batches) == 0 { 801 return nil 802 } 803 804 noNeedFork := len(batches) == 1 805 if !noNeedFork { 806 if ac, ok := action.(actionCommit); ok && ac.retry { 807 noNeedFork = true 808 } 809 } 810 if noNeedFork { 811 for _, b := range batches { 812 e := action.handleSingleBatch(c, bo, b) 813 if e != nil { 814 logutil.BgLogger().Debug("2PC doActionOnBatches failed", 815 zap.Uint64("session", c.sessionID), 816 zap.Stringer("action type", action), 817 zap.Error(e), 818 zap.Uint64("txnStartTS", c.startTS)) 819 return errors.Trace(e) 820 } 821 } 822 return nil 823 } 824 rateLim := len(batches) 825 // Set rateLim here for the large transaction. 826 // If the rate limit is too high, tikv will report service is busy. 827 // If the rate limit is too low, we can't full utilize the tikv's throughput. 828 // TODO: Find a self-adaptive way to control the rate limit here. 
829 if rateLim > config.GetGlobalConfig().CommitterConcurrency { 830 rateLim = config.GetGlobalConfig().CommitterConcurrency 831 } 832 batchExecutor := newBatchExecutor(rateLim, c, action, bo) 833 err := batchExecutor.process(batches) 834 return errors.Trace(err) 835 } 836 837 func (c *twoPhaseCommitter) keyValueSize(key, value []byte) int { 838 return len(key) + len(value) 839 } 840 841 func (c *twoPhaseCommitter) keySize(key, value []byte) int { 842 return len(key) 843 } 844 845 func (c *twoPhaseCommitter) SetDiskFullOpt(level kvrpcpb.DiskFullOpt) { 846 c.diskFullOpt = level 847 } 848 849 type ttlManagerState uint32 850 851 const ( 852 stateUninitialized ttlManagerState = iota 853 stateRunning 854 stateClosed 855 ) 856 857 type ttlManager struct { 858 state ttlManagerState 859 ch chan struct{} 860 lockCtx *kv.LockCtx 861 } 862 863 func (tm *ttlManager) run(c *twoPhaseCommitter, lockCtx *kv.LockCtx) { 864 if _, err := util.EvalFailpoint("doNotKeepAlive"); err == nil { 865 return 866 } 867 868 // Run only once. 869 if !atomic.CompareAndSwapUint32((*uint32)(&tm.state), uint32(stateUninitialized), uint32(stateRunning)) { 870 return 871 } 872 tm.ch = make(chan struct{}) 873 tm.lockCtx = lockCtx 874 875 go keepAlive(c, tm.ch, c.primary(), lockCtx) 876 } 877 878 func (tm *ttlManager) close() { 879 if !atomic.CompareAndSwapUint32((*uint32)(&tm.state), uint32(stateRunning), uint32(stateClosed)) { 880 return 881 } 882 close(tm.ch) 883 } 884 885 func (tm *ttlManager) reset() { 886 if !atomic.CompareAndSwapUint32((*uint32)(&tm.state), uint32(stateRunning), uint32(stateUninitialized)) { 887 return 888 } 889 close(tm.ch) 890 } 891 892 const keepAliveMaxBackoff = 20000 // 20 seconds 893 const pessimisticLockMaxBackoff = 600000 // 10 minutes 894 const maxConsecutiveFailure = 10 895 896 func keepAlive(c *twoPhaseCommitter, closeCh chan struct{}, primaryKey []byte, lockCtx *kv.LockCtx) { 897 // Ticker is set to 1/2 of the ManagedLockTTL. 
898 ticker := time.NewTicker(time.Duration(atomic.LoadUint64(&ManagedLockTTL)) * time.Millisecond / 2) 899 defer ticker.Stop() 900 keepFail := 0 901 for { 902 select { 903 case <-closeCh: 904 return 905 case <-ticker.C: 906 // If kill signal is received, the ttlManager should exit. 907 if lockCtx != nil && lockCtx.Killed != nil && atomic.LoadUint32(lockCtx.Killed) != 0 { 908 return 909 } 910 bo := retry.NewBackofferWithVars(context.Background(), keepAliveMaxBackoff, c.txn.vars) 911 now, err := c.store.GetTimestampWithRetry(bo, c.txn.GetScope()) 912 if err != nil { 913 logutil.Logger(bo.GetCtx()).Warn("keepAlive get tso fail", 914 zap.Error(err)) 915 return 916 } 917 918 uptime := uint64(oracle.ExtractPhysical(now) - oracle.ExtractPhysical(c.startTS)) 919 if uptime > config.GetGlobalConfig().MaxTxnTTL { 920 // Checks maximum lifetime for the ttlManager, so when something goes wrong 921 // the key will not be locked forever. 922 logutil.Logger(bo.GetCtx()).Info("ttlManager live up to its lifetime", 923 zap.Uint64("txnStartTS", c.startTS), 924 zap.Uint64("uptime", uptime), 925 zap.Uint64("maxTxnTTL", config.GetGlobalConfig().MaxTxnTTL)) 926 metrics.TiKVTTLLifeTimeReachCounter.Inc() 927 // the pessimistic locks may expire if the ttl manager has timed out, set `LockExpired` flag 928 // so that this transaction could only commit or rollback with no more statement executions 929 if c.isPessimistic && lockCtx != nil && lockCtx.LockExpired != nil { 930 atomic.StoreUint32(lockCtx.LockExpired, 1) 931 } 932 return 933 } 934 935 newTTL := uptime + atomic.LoadUint64(&ManagedLockTTL) 936 logutil.Logger(bo.GetCtx()).Info("send TxnHeartBeat", 937 zap.Uint64("startTS", c.startTS), zap.Uint64("newTTL", newTTL)) 938 startTime := time.Now() 939 _, stopHeartBeat, err := sendTxnHeartBeat(bo, c.store, primaryKey, c.startTS, newTTL) 940 if err != nil { 941 keepFail++ 942 metrics.TxnHeartBeatHistogramError.Observe(time.Since(startTime).Seconds()) 943 logutil.Logger(bo.GetCtx()).Debug("send 
TxnHeartBeat failed", 944 zap.Error(err), 945 zap.Uint64("txnStartTS", c.startTS)) 946 if stopHeartBeat || keepFail > maxConsecutiveFailure { 947 logutil.Logger(bo.GetCtx()).Warn("stop TxnHeartBeat", 948 zap.Error(err), 949 zap.Int("consecutiveFailure", keepFail), 950 zap.Uint64("txnStartTS", c.startTS)) 951 return 952 } 953 continue 954 } 955 keepFail = 0 956 metrics.TxnHeartBeatHistogramOK.Observe(time.Since(startTime).Seconds()) 957 } 958 } 959 } 960 961 func sendTxnHeartBeat(bo *retry.Backoffer, store kvstore, primary []byte, startTS, ttl uint64) (newTTL uint64, stopHeartBeat bool, err error) { 962 req := tikvrpc.NewRequest(tikvrpc.CmdTxnHeartBeat, &kvrpcpb.TxnHeartBeatRequest{ 963 PrimaryLock: primary, 964 StartVersion: startTS, 965 AdviseLockTtl: ttl, 966 }) 967 for { 968 loc, err := store.GetRegionCache().LocateKey(bo, primary) 969 if err != nil { 970 return 0, false, errors.Trace(err) 971 } 972 req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds()) 973 resp, err := store.SendReq(bo, req, loc.Region, client.ReadTimeoutShort) 974 if err != nil { 975 return 0, false, errors.Trace(err) 976 } 977 regionErr, err := resp.GetRegionError() 978 if err != nil { 979 return 0, false, errors.Trace(err) 980 } 981 if regionErr != nil { 982 // For other region error and the fake region error, backoff because 983 // there's something wrong. 984 // For the real EpochNotMatch error, don't backoff. 
985 if regionErr.GetEpochNotMatch() == nil || locate.IsFakeRegionError(regionErr) { 986 err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String())) 987 if err != nil { 988 return 0, false, errors.Trace(err) 989 } 990 } 991 continue 992 } 993 if resp.Resp == nil { 994 return 0, false, errors.Trace(tikverr.ErrBodyMissing) 995 } 996 cmdResp := resp.Resp.(*kvrpcpb.TxnHeartBeatResponse) 997 if keyErr := cmdResp.GetError(); keyErr != nil { 998 return 0, true, errors.Errorf("txn %d heartbeat fail, primary key = %v, err = %s", startTS, hex.EncodeToString(primary), tikverr.ExtractKeyErr(keyErr)) 999 } 1000 return cmdResp.GetLockTtl(), false, nil 1001 } 1002 } 1003 1004 // checkAsyncCommit checks if async commit protocol is available for current transaction commit, true is returned if possible. 1005 func (c *twoPhaseCommitter) checkAsyncCommit() bool { 1006 // Disable async commit in local transactions 1007 if c.txn.GetScope() != oracle.GlobalTxnScope { 1008 return false 1009 } 1010 1011 asyncCommitCfg := config.GetGlobalConfig().TiKVClient.AsyncCommit 1012 // TODO the keys limit need more tests, this value makes the unit test pass by now. 1013 // Async commit is not compatible with Binlog because of the non unique timestamp issue. 1014 if c.txn.enableAsyncCommit && 1015 uint(c.mutations.Len()) <= asyncCommitCfg.KeysLimit && 1016 !c.shouldWriteBinlog() { 1017 totalKeySize := uint64(0) 1018 for i := 0; i < c.mutations.Len(); i++ { 1019 totalKeySize += uint64(len(c.mutations.GetKey(i))) 1020 if totalKeySize > asyncCommitCfg.TotalKeySizeLimit { 1021 return false 1022 } 1023 } 1024 return true 1025 } 1026 return false 1027 } 1028 1029 // checkOnePC checks if 1PC protocol is available for current transaction. 
1030 func (c *twoPhaseCommitter) checkOnePC() bool { 1031 // Disable 1PC in local transactions 1032 if c.txn.GetScope() != oracle.GlobalTxnScope { 1033 return false 1034 } 1035 1036 return !c.shouldWriteBinlog() && c.txn.enable1PC 1037 } 1038 1039 func (c *twoPhaseCommitter) needLinearizability() bool { 1040 return !c.txn.causalConsistency 1041 } 1042 1043 func (c *twoPhaseCommitter) isAsyncCommit() bool { 1044 return atomic.LoadUint32(&c.useAsyncCommit) > 0 1045 } 1046 1047 func (c *twoPhaseCommitter) setAsyncCommit(val bool) { 1048 if val { 1049 atomic.StoreUint32(&c.useAsyncCommit, 1) 1050 } else { 1051 atomic.StoreUint32(&c.useAsyncCommit, 0) 1052 } 1053 } 1054 1055 func (c *twoPhaseCommitter) isOnePC() bool { 1056 return atomic.LoadUint32(&c.useOnePC) > 0 1057 } 1058 1059 func (c *twoPhaseCommitter) setOnePC(val bool) { 1060 if val { 1061 atomic.StoreUint32(&c.useOnePC, 1) 1062 } else { 1063 atomic.StoreUint32(&c.useOnePC, 0) 1064 } 1065 } 1066 1067 func (c *twoPhaseCommitter) checkOnePCFallBack(action twoPhaseCommitAction, batchCount int) { 1068 if _, ok := action.(actionPrewrite); ok { 1069 if batchCount > 1 { 1070 c.setOnePC(false) 1071 } 1072 } 1073 } 1074 1075 const ( 1076 cleanupMaxBackoff = 20000 1077 // TsoMaxBackoff is the max sleep time to get tso. 
1078 TsoMaxBackoff = 15000 1079 ) 1080 1081 func (c *twoPhaseCommitter) cleanup(ctx context.Context) { 1082 c.cleanWg.Add(1) 1083 c.store.WaitGroup().Add(1) 1084 go func() { 1085 defer c.store.WaitGroup().Done() 1086 if _, err := util.EvalFailpoint("commitFailedSkipCleanup"); err == nil { 1087 logutil.Logger(ctx).Info("[failpoint] injected skip cleanup secondaries on failure", 1088 zap.Uint64("txnStartTS", c.startTS)) 1089 c.cleanWg.Done() 1090 return 1091 } 1092 1093 cleanupKeysCtx := context.WithValue(c.store.Ctx(), retry.TxnStartKey, ctx.Value(retry.TxnStartKey)) 1094 var err error 1095 if !c.isOnePC() { 1096 err = c.cleanupMutations(retry.NewBackofferWithVars(cleanupKeysCtx, cleanupMaxBackoff, c.txn.vars), c.mutations) 1097 } else if c.isPessimistic { 1098 err = c.pessimisticRollbackMutations(retry.NewBackofferWithVars(cleanupKeysCtx, cleanupMaxBackoff, c.txn.vars), c.mutations) 1099 } 1100 1101 if err != nil { 1102 metrics.SecondaryLockCleanupFailureCounterRollback.Inc() 1103 logutil.Logger(ctx).Info("2PC cleanup failed", zap.Error(err), zap.Uint64("txnStartTS", c.startTS), 1104 zap.Bool("isPessimistic", c.isPessimistic), zap.Bool("isOnePC", c.isOnePC())) 1105 } else { 1106 logutil.Logger(ctx).Debug("2PC clean up done", 1107 zap.Uint64("txnStartTS", c.startTS), zap.Bool("isPessimistic", c.isPessimistic), 1108 zap.Bool("isOnePC", c.isOnePC())) 1109 } 1110 c.cleanWg.Done() 1111 }() 1112 } 1113 1114 // execute executes the two-phase commit protocol. 1115 func (c *twoPhaseCommitter) execute(ctx context.Context) (err error) { 1116 var binlogSkipped bool 1117 defer func() { 1118 if c.isOnePC() { 1119 // The error means the 1PC transaction failed. 1120 if err != nil { 1121 if c.getUndeterminedErr() == nil { 1122 c.cleanup(ctx) 1123 } 1124 metrics.OnePCTxnCounterError.Inc() 1125 } else { 1126 metrics.OnePCTxnCounterOk.Inc() 1127 } 1128 } else if c.isAsyncCommit() { 1129 // The error means the async commit should not succeed. 
1130 if err != nil { 1131 if c.getUndeterminedErr() == nil { 1132 c.cleanup(ctx) 1133 } 1134 metrics.AsyncCommitTxnCounterError.Inc() 1135 } else { 1136 metrics.AsyncCommitTxnCounterOk.Inc() 1137 } 1138 } else { 1139 // Always clean up all written keys if the txn does not commit. 1140 c.mu.RLock() 1141 committed := c.mu.committed 1142 undetermined := c.mu.undeterminedErr != nil 1143 c.mu.RUnlock() 1144 if !committed && !undetermined { 1145 c.cleanup(ctx) 1146 metrics.TwoPCTxnCounterError.Inc() 1147 } else { 1148 metrics.TwoPCTxnCounterOk.Inc() 1149 } 1150 c.txn.commitTS = c.commitTS 1151 if binlogSkipped { 1152 c.binlog.Skip() 1153 return 1154 } 1155 if !c.shouldWriteBinlog() { 1156 return 1157 } 1158 if err != nil { 1159 c.binlog.Commit(ctx, 0) 1160 } else { 1161 c.binlog.Commit(ctx, int64(c.commitTS)) 1162 } 1163 } 1164 }() 1165 1166 commitTSMayBeCalculated := false 1167 // Check async commit is available or not. 1168 if c.checkAsyncCommit() { 1169 commitTSMayBeCalculated = true 1170 c.setAsyncCommit(true) 1171 c.hasTriedAsyncCommit = true 1172 } 1173 // Check if 1PC is enabled. 1174 if c.checkOnePC() { 1175 commitTSMayBeCalculated = true 1176 c.setOnePC(true) 1177 c.hasTriedOnePC = true 1178 } 1179 1180 // TODO(youjiali1995): It's better to use different maxSleep for different operations 1181 // and distinguish permanent errors from temporary errors, for example: 1182 // - If all PDs are down, all requests to PD will fail due to network error. 1183 // The maxSleep should't be very long in this case. 1184 // - If the region isn't found in PD, it's possible the reason is write-stall. 1185 // The maxSleep can be long in this case. 1186 bo := retry.NewBackofferWithVars(ctx, PrewriteMaxBackoff, c.txn.vars) 1187 1188 // If we want to use async commit or 1PC and also want linearizability across 1189 // all nodes, we have to make sure the commit TS of this transaction is greater 1190 // than the snapshot TS of all existent readers. 
So we get a new timestamp 1191 // from PD and plus one as our MinCommitTS. 1192 if commitTSMayBeCalculated && c.needLinearizability() { 1193 util.EvalFailpoint("getMinCommitTSFromTSO") 1194 latestTS, err := c.store.GetTimestampWithRetry(bo, c.txn.GetScope()) 1195 // If we fail to get a timestamp from PD, we just propagate the failure 1196 // instead of falling back to the normal 2PC because a normal 2PC will 1197 // also be likely to fail due to the same timestamp issue. 1198 if err != nil { 1199 return errors.Trace(err) 1200 } 1201 // Plus 1 to avoid producing the same commit TS with previously committed transactions 1202 c.minCommitTS = latestTS + 1 1203 } 1204 // Calculate maxCommitTS if necessary 1205 if commitTSMayBeCalculated { 1206 if err = c.calculateMaxCommitTS(ctx); err != nil { 1207 return errors.Trace(err) 1208 } 1209 } 1210 1211 if c.sessionID > 0 { 1212 util.EvalFailpoint("beforePrewrite") 1213 } 1214 1215 c.prewriteStarted = true 1216 var binlogChan <-chan BinlogWriteResult 1217 if c.shouldWriteBinlog() { 1218 binlogChan = c.binlog.Prewrite(ctx, c.primary()) 1219 } 1220 1221 start := time.Now() 1222 err = c.prewriteMutations(bo, c.mutations) 1223 1224 if err != nil { 1225 // TODO: Now we return an undetermined error as long as one of the prewrite 1226 // RPCs fails. However, if there are multiple errors and some of the errors 1227 // are not RPC failures, we can return the actual error instead of undetermined. 
1228 if undeterminedErr := c.getUndeterminedErr(); undeterminedErr != nil { 1229 logutil.Logger(ctx).Error("2PC commit result undetermined", 1230 zap.Error(err), 1231 zap.NamedError("rpcErr", undeterminedErr), 1232 zap.Uint64("txnStartTS", c.startTS)) 1233 return errors.Trace(terror.ErrResultUndetermined) 1234 } 1235 } 1236 1237 commitDetail := c.getDetail() 1238 commitDetail.PrewriteTime = time.Since(start) 1239 if bo.GetTotalSleep() > 0 { 1240 boSleep := int64(bo.GetTotalSleep()) * int64(time.Millisecond) 1241 commitDetail.Mu.Lock() 1242 if boSleep > commitDetail.Mu.CommitBackoffTime { 1243 commitDetail.Mu.CommitBackoffTime = boSleep 1244 commitDetail.Mu.BackoffTypes = bo.GetTypes() 1245 } 1246 commitDetail.Mu.Unlock() 1247 } 1248 1249 if binlogChan != nil { 1250 startWaitBinlog := time.Now() 1251 binlogWriteResult := <-binlogChan 1252 commitDetail.WaitPrewriteBinlogTime = time.Since(startWaitBinlog) 1253 if binlogWriteResult != nil { 1254 binlogSkipped = binlogWriteResult.Skipped() 1255 binlogErr := binlogWriteResult.GetError() 1256 if binlogErr != nil { 1257 return binlogErr 1258 } 1259 } 1260 } 1261 if err != nil { 1262 logutil.Logger(ctx).Debug("2PC failed on prewrite", 1263 zap.Error(err), 1264 zap.Uint64("txnStartTS", c.startTS)) 1265 return errors.Trace(err) 1266 } 1267 1268 // strip check_not_exists keys that no need to commit. 
1269 c.stripNoNeedCommitKeys() 1270 1271 var commitTS uint64 1272 1273 if c.isOnePC() { 1274 if c.onePCCommitTS == 0 { 1275 err = errors.Errorf("session %d invalid onePCCommitTS for 1PC protocol after prewrite, startTS=%v", c.sessionID, c.startTS) 1276 return errors.Trace(err) 1277 } 1278 c.commitTS = c.onePCCommitTS 1279 c.txn.commitTS = c.commitTS 1280 logutil.Logger(ctx).Debug("1PC protocol is used to commit this txn", 1281 zap.Uint64("startTS", c.startTS), zap.Uint64("commitTS", c.commitTS), 1282 zap.Uint64("session", c.sessionID)) 1283 return nil 1284 } 1285 1286 if c.onePCCommitTS != 0 { 1287 logutil.Logger(ctx).Fatal("non 1PC transaction committed in 1PC", 1288 zap.Uint64("session", c.sessionID), zap.Uint64("startTS", c.startTS)) 1289 } 1290 1291 if c.isAsyncCommit() { 1292 if c.minCommitTS == 0 { 1293 err = errors.Errorf("session %d invalid minCommitTS for async commit protocol after prewrite, startTS=%v", c.sessionID, c.startTS) 1294 return errors.Trace(err) 1295 } 1296 commitTS = c.minCommitTS 1297 } else { 1298 start = time.Now() 1299 logutil.Event(ctx, "start get commit ts") 1300 commitTS, err = c.store.GetTimestampWithRetry(retry.NewBackofferWithVars(ctx, TsoMaxBackoff, c.txn.vars), c.txn.GetScope()) 1301 if err != nil { 1302 logutil.Logger(ctx).Warn("2PC get commitTS failed", 1303 zap.Error(err), 1304 zap.Uint64("txnStartTS", c.startTS)) 1305 return errors.Trace(err) 1306 } 1307 commitDetail.GetCommitTsTime = time.Since(start) 1308 logutil.Event(ctx, "finish get commit ts") 1309 logutil.SetTag(ctx, "commitTs", commitTS) 1310 } 1311 1312 if !c.isAsyncCommit() { 1313 tryAmend := c.isPessimistic && c.sessionID > 0 && c.txn.schemaAmender != nil 1314 if !tryAmend { 1315 _, _, err = c.checkSchemaValid(ctx, commitTS, c.txn.schemaVer, false) 1316 if err != nil { 1317 return errors.Trace(err) 1318 } 1319 } else { 1320 relatedSchemaChange, memAmended, err := c.checkSchemaValid(ctx, commitTS, c.txn.schemaVer, true) 1321 if err != nil { 1322 return 
errors.Trace(err) 1323 } 1324 if memAmended { 1325 // Get new commitTS and check schema valid again. 1326 newCommitTS, err := c.getCommitTS(ctx, commitDetail) 1327 if err != nil { 1328 return errors.Trace(err) 1329 } 1330 // If schema check failed between commitTS and newCommitTs, report schema change error. 1331 _, _, err = c.checkSchemaValid(ctx, newCommitTS, relatedSchemaChange.LatestInfoSchema, false) 1332 if err != nil { 1333 logutil.Logger(ctx).Info("schema check after amend failed, it means the schema version changed again", 1334 zap.Uint64("startTS", c.startTS), 1335 zap.Uint64("amendTS", commitTS), 1336 zap.Int64("amendedSchemaVersion", relatedSchemaChange.LatestInfoSchema.SchemaMetaVersion()), 1337 zap.Uint64("newCommitTS", newCommitTS)) 1338 return errors.Trace(err) 1339 } 1340 commitTS = newCommitTS 1341 } 1342 } 1343 } 1344 atomic.StoreUint64(&c.commitTS, commitTS) 1345 1346 if c.store.GetOracle().IsExpired(c.startTS, MaxTxnTimeUse, &oracle.Option{TxnScope: oracle.GlobalTxnScope}) { 1347 err = errors.Errorf("session %d txn takes too much time, txnStartTS: %d, comm: %d", 1348 c.sessionID, c.startTS, c.commitTS) 1349 return err 1350 } 1351 1352 if c.sessionID > 0 { 1353 if val, err := util.EvalFailpoint("beforeCommit"); err == nil { 1354 // Pass multiple instructions in one string, delimited by commas, to trigger multiple behaviors, like 1355 // `return("delay,fail")`. Then they will be executed sequentially at once. 1356 if v, ok := val.(string); ok { 1357 for _, action := range strings.Split(v, ",") { 1358 // Async commit transactions cannot return error here, since it's already successful. 
1359 if action == "fail" && !c.isAsyncCommit() { 1360 logutil.Logger(ctx).Info("[failpoint] injected failure before commit", zap.Uint64("txnStartTS", c.startTS)) 1361 return errors.New("injected failure before commit") 1362 } else if action == "delay" { 1363 duration := time.Duration(rand.Int63n(int64(time.Second) * 5)) 1364 logutil.Logger(ctx).Info("[failpoint] injected delay before commit", 1365 zap.Uint64("txnStartTS", c.startTS), zap.Duration("duration", duration)) 1366 time.Sleep(duration) 1367 } 1368 } 1369 } 1370 } 1371 } 1372 1373 if c.isAsyncCommit() { 1374 // For async commit protocol, the commit is considered success here. 1375 c.txn.commitTS = c.commitTS 1376 logutil.Logger(ctx).Debug("2PC will use async commit protocol to commit this txn", 1377 zap.Uint64("startTS", c.startTS), zap.Uint64("commitTS", c.commitTS), 1378 zap.Uint64("sessionID", c.sessionID)) 1379 c.store.WaitGroup().Add(1) 1380 go func() { 1381 defer c.store.WaitGroup().Done() 1382 if _, err := util.EvalFailpoint("asyncCommitDoNothing"); err == nil { 1383 return 1384 } 1385 commitBo := retry.NewBackofferWithVars(c.store.Ctx(), CommitSecondaryMaxBackoff, c.txn.vars) 1386 err := c.commitMutations(commitBo, c.mutations) 1387 if err != nil { 1388 logutil.Logger(ctx).Warn("2PC async commit failed", zap.Uint64("sessionID", c.sessionID), 1389 zap.Uint64("startTS", c.startTS), zap.Uint64("commitTS", c.commitTS), zap.Error(err)) 1390 } 1391 }() 1392 return nil 1393 } 1394 return c.commitTxn(ctx, commitDetail) 1395 } 1396 1397 func (c *twoPhaseCommitter) commitTxn(ctx context.Context, commitDetail *util.CommitDetails) error { 1398 c.txn.GetMemBuffer().DiscardValues() 1399 start := time.Now() 1400 1401 // Use the VeryLongMaxBackoff to commit the primary key. 
1402 commitBo := retry.NewBackofferWithVars(ctx, int(CommitMaxBackoff), c.txn.vars) 1403 err := c.commitMutations(commitBo, c.mutations) 1404 commitDetail.CommitTime = time.Since(start) 1405 if commitBo.GetTotalSleep() > 0 { 1406 commitDetail.Mu.Lock() 1407 commitDetail.Mu.CommitBackoffTime += int64(commitBo.GetTotalSleep()) * int64(time.Millisecond) 1408 commitDetail.Mu.BackoffTypes = append(commitDetail.Mu.BackoffTypes, commitBo.GetTypes()...) 1409 commitDetail.Mu.Unlock() 1410 } 1411 if err != nil { 1412 if undeterminedErr := c.getUndeterminedErr(); undeterminedErr != nil { 1413 logutil.Logger(ctx).Error("2PC commit result undetermined", 1414 zap.Error(err), 1415 zap.NamedError("rpcErr", undeterminedErr), 1416 zap.Uint64("txnStartTS", c.startTS)) 1417 err = errors.Trace(terror.ErrResultUndetermined) 1418 } 1419 if !c.mu.committed { 1420 logutil.Logger(ctx).Debug("2PC failed on commit", 1421 zap.Error(err), 1422 zap.Uint64("txnStartTS", c.startTS)) 1423 return errors.Trace(err) 1424 } 1425 logutil.Logger(ctx).Debug("got some exceptions, but 2PC was still successful", 1426 zap.Error(err), 1427 zap.Uint64("txnStartTS", c.startTS)) 1428 } 1429 return nil 1430 } 1431 1432 func (c *twoPhaseCommitter) stripNoNeedCommitKeys() { 1433 if !c.hasNoNeedCommitKeys { 1434 return 1435 } 1436 m := c.mutations 1437 var newIdx int 1438 for oldIdx := range m.handles { 1439 key := m.GetKey(oldIdx) 1440 flags, err := c.txn.GetMemBuffer().GetFlags(key) 1441 if err == nil && flags.HasPrewriteOnly() { 1442 continue 1443 } 1444 m.handles[newIdx] = m.handles[oldIdx] 1445 newIdx++ 1446 } 1447 c.mutations.handles = c.mutations.handles[:newIdx] 1448 } 1449 1450 // SchemaVer is the infoSchema which will return the schema version. 1451 type SchemaVer interface { 1452 // SchemaMetaVersion returns the meta schema version. 1453 SchemaMetaVersion() int64 1454 } 1455 1456 // SchemaLeaseChecker is used to validate schema version is not changed during transaction execution. 
1457 type SchemaLeaseChecker interface { 1458 // CheckBySchemaVer checks if the schema has changed for the transaction related tables between the startSchemaVer 1459 // and the schema version at txnTS, all the related schema changes will be returned. 1460 CheckBySchemaVer(txnTS uint64, startSchemaVer SchemaVer) (*RelatedSchemaChange, error) 1461 } 1462 1463 // RelatedSchemaChange contains information about schema diff between two schema versions. 1464 type RelatedSchemaChange struct { 1465 PhyTblIDS []int64 1466 ActionTypes []uint64 1467 LatestInfoSchema SchemaVer 1468 Amendable bool 1469 } 1470 1471 func (c *twoPhaseCommitter) amendPessimisticLock(ctx context.Context, addMutations CommitterMutations) error { 1472 keysNeedToLock := NewPlainMutations(addMutations.Len()) 1473 for i := 0; i < addMutations.Len(); i++ { 1474 if addMutations.IsPessimisticLock(i) { 1475 keysNeedToLock.Push(addMutations.GetOp(i), addMutations.GetKey(i), addMutations.GetValue(i), addMutations.IsPessimisticLock(i)) 1476 } 1477 } 1478 // For unique index amend, we need to pessimistic lock the generated new index keys first. 1479 // Set doingAmend to true to force the pessimistic lock do the exist check for these keys. 1480 c.doingAmend = true 1481 defer func() { c.doingAmend = false }() 1482 if keysNeedToLock.Len() > 0 { 1483 lCtx := kv.NewLockCtx(c.forUpdateTS, c.lockCtx.LockWaitTime(), time.Now()) 1484 lCtx.Killed = c.lockCtx.Killed 1485 tryTimes := uint(0) 1486 retryLimit := config.GetGlobalConfig().PessimisticTxn.MaxRetryCount 1487 var err error 1488 for tryTimes < retryLimit { 1489 pessimisticLockBo := retry.NewBackofferWithVars(ctx, pessimisticLockMaxBackoff, c.txn.vars) 1490 err = c.pessimisticLockMutations(pessimisticLockBo, lCtx, &keysNeedToLock) 1491 if err != nil { 1492 // KeysNeedToLock won't change, so don't async rollback pessimistic locks here for write conflict. 
1493 if _, ok := errors.Cause(err).(*tikverr.ErrWriteConflict); ok { 1494 newForUpdateTSVer, err := c.store.CurrentTimestamp(oracle.GlobalTxnScope) 1495 if err != nil { 1496 return errors.Trace(err) 1497 } 1498 lCtx.ForUpdateTS = newForUpdateTSVer 1499 c.forUpdateTS = newForUpdateTSVer 1500 logutil.Logger(ctx).Info("amend pessimistic lock pessimistic retry lock", 1501 zap.Uint("tryTimes", tryTimes), zap.Uint64("startTS", c.startTS), 1502 zap.Uint64("newForUpdateTS", c.forUpdateTS)) 1503 tryTimes++ 1504 continue 1505 } 1506 logutil.Logger(ctx).Warn("amend pessimistic lock has failed", zap.Error(err), zap.Uint64("txnStartTS", c.startTS)) 1507 return err 1508 } 1509 logutil.Logger(ctx).Info("amend pessimistic lock finished", zap.Uint64("startTS", c.startTS), 1510 zap.Uint64("forUpdateTS", c.forUpdateTS), zap.Int("keys", keysNeedToLock.Len())) 1511 break 1512 } 1513 if err != nil { 1514 logutil.Logger(ctx).Warn("amend pessimistic lock failed after retry", 1515 zap.Uint("tryTimes", tryTimes), zap.Uint64("startTS", c.startTS)) 1516 return err 1517 } 1518 } 1519 return nil 1520 } 1521 1522 func (c *twoPhaseCommitter) tryAmendTxn(ctx context.Context, startInfoSchema SchemaVer, change *RelatedSchemaChange) (bool, error) { 1523 addMutations, err := c.txn.schemaAmender.AmendTxn(ctx, startInfoSchema, change, c.mutations) 1524 if err != nil { 1525 return false, err 1526 } 1527 // Add new mutations to the mutation list or prewrite them if prewrite already starts. 
1528 if addMutations != nil && addMutations.Len() > 0 { 1529 err = c.amendPessimisticLock(ctx, addMutations) 1530 if err != nil { 1531 logutil.Logger(ctx).Info("amendPessimisticLock has failed", zap.Error(err)) 1532 return false, err 1533 } 1534 if c.prewriteStarted { 1535 prewriteBo := retry.NewBackofferWithVars(ctx, PrewriteMaxBackoff, c.txn.vars) 1536 err = c.prewriteMutations(prewriteBo, addMutations) 1537 if err != nil { 1538 logutil.Logger(ctx).Warn("amend prewrite has failed", zap.Error(err), zap.Uint64("txnStartTS", c.startTS)) 1539 return false, err 1540 } 1541 logutil.Logger(ctx).Info("amend prewrite finished", zap.Uint64("txnStartTS", c.startTS)) 1542 return true, nil 1543 } 1544 memBuf := c.txn.GetMemBuffer() 1545 for i := 0; i < addMutations.Len(); i++ { 1546 key := addMutations.GetKey(i) 1547 op := addMutations.GetOp(i) 1548 var err error 1549 if op == kvrpcpb.Op_Del { 1550 err = memBuf.Delete(key) 1551 } else { 1552 err = memBuf.Set(key, addMutations.GetValue(i)) 1553 } 1554 if err != nil { 1555 logutil.Logger(ctx).Warn("amend mutations has failed", zap.Error(err), zap.Uint64("txnStartTS", c.startTS)) 1556 return false, err 1557 } 1558 handle := c.txn.GetMemBuffer().IterWithFlags(key, nil).Handle() 1559 c.mutations.Push(op, addMutations.IsPessimisticLock(i), handle) 1560 } 1561 } 1562 return false, nil 1563 } 1564 1565 func (c *twoPhaseCommitter) getCommitTS(ctx context.Context, commitDetail *util.CommitDetails) (uint64, error) { 1566 start := time.Now() 1567 logutil.Event(ctx, "start get commit ts") 1568 commitTS, err := c.store.GetTimestampWithRetry(retry.NewBackofferWithVars(ctx, TsoMaxBackoff, c.txn.vars), c.txn.GetScope()) 1569 if err != nil { 1570 logutil.Logger(ctx).Warn("2PC get commitTS failed", 1571 zap.Error(err), 1572 zap.Uint64("txnStartTS", c.startTS)) 1573 return 0, errors.Trace(err) 1574 } 1575 commitDetail.GetCommitTsTime = time.Since(start) 1576 logutil.Event(ctx, "finish get commit ts") 1577 logutil.SetTag(ctx, "commitTS", 
commitTS) 1578 1579 // Check commitTS. 1580 if commitTS <= c.startTS { 1581 err = errors.Errorf("session %d invalid transaction tso with txnStartTS=%v while txnCommitTS=%v", 1582 c.sessionID, c.startTS, commitTS) 1583 logutil.BgLogger().Error("invalid transaction", zap.Error(err)) 1584 return 0, errors.Trace(err) 1585 } 1586 return commitTS, nil 1587 } 1588 1589 // checkSchemaValid checks if the schema has changed, if tryAmend is set to true, committer will try to amend 1590 // this transaction using the related schema changes. 1591 func (c *twoPhaseCommitter) checkSchemaValid(ctx context.Context, checkTS uint64, startInfoSchema SchemaVer, 1592 tryAmend bool) (*RelatedSchemaChange, bool, error) { 1593 if _, err := util.EvalFailpoint("failCheckSchemaValid"); err == nil { 1594 logutil.Logger(ctx).Info("[failpoint] injected fail schema check", 1595 zap.Uint64("txnStartTS", c.startTS)) 1596 err := errors.Errorf("mock check schema valid failure") 1597 return nil, false, err 1598 } 1599 if c.txn.schemaLeaseChecker == nil { 1600 if c.sessionID > 0 { 1601 logutil.Logger(ctx).Warn("schemaLeaseChecker is not set for this transaction", 1602 zap.Uint64("sessionID", c.sessionID), 1603 zap.Uint64("startTS", c.startTS), 1604 zap.Uint64("commitTS", checkTS)) 1605 } 1606 return nil, false, nil 1607 } 1608 relatedChanges, err := c.txn.schemaLeaseChecker.CheckBySchemaVer(checkTS, startInfoSchema) 1609 if err != nil { 1610 if tryAmend && relatedChanges != nil && relatedChanges.Amendable && c.txn.schemaAmender != nil { 1611 memAmended, amendErr := c.tryAmendTxn(ctx, startInfoSchema, relatedChanges) 1612 if amendErr != nil { 1613 logutil.BgLogger().Info("txn amend has failed", zap.Uint64("sessionID", c.sessionID), 1614 zap.Uint64("startTS", c.startTS), zap.Error(amendErr)) 1615 return nil, false, err 1616 } 1617 logutil.Logger(ctx).Info("amend txn successfully", 1618 zap.Uint64("sessionID", c.sessionID), zap.Uint64("txn startTS", c.startTS), zap.Bool("memAmended", memAmended), 1619 
zap.Uint64("checkTS", checkTS), zap.Int64("startInfoSchemaVer", startInfoSchema.SchemaMetaVersion()), 1620 zap.Int64s("table ids", relatedChanges.PhyTblIDS), zap.Uint64s("action types", relatedChanges.ActionTypes)) 1621 return relatedChanges, memAmended, nil 1622 } 1623 return nil, false, errors.Trace(err) 1624 } 1625 return nil, false, nil 1626 } 1627 1628 func (c *twoPhaseCommitter) calculateMaxCommitTS(ctx context.Context) error { 1629 // Amend txn with current time first, then we can make sure we have another SafeWindow time to commit 1630 currentTS := oracle.ComposeTS(int64(time.Since(c.txn.startTime)/time.Millisecond), 0) + c.startTS 1631 _, _, err := c.checkSchemaValid(ctx, currentTS, c.txn.schemaVer, true) 1632 if err != nil { 1633 logutil.Logger(ctx).Info("Schema changed for async commit txn", 1634 zap.Error(err), 1635 zap.Uint64("startTS", c.startTS)) 1636 return errors.Trace(err) 1637 } 1638 1639 safeWindow := config.GetGlobalConfig().TiKVClient.AsyncCommit.SafeWindow 1640 maxCommitTS := oracle.ComposeTS(int64(safeWindow/time.Millisecond), 0) + currentTS 1641 logutil.BgLogger().Debug("calculate MaxCommitTS", 1642 zap.Time("startTime", c.txn.startTime), 1643 zap.Duration("safeWindow", safeWindow), 1644 zap.Uint64("startTS", c.startTS), 1645 zap.Uint64("maxCommitTS", maxCommitTS)) 1646 1647 c.maxCommitTS = maxCommitTS 1648 return nil 1649 } 1650 1651 func (c *twoPhaseCommitter) shouldWriteBinlog() bool { 1652 return c.binlog != nil 1653 } 1654 1655 // TiKV recommends each RPC packet should be less than ~1MB. We keep each packet's 1656 // Key+Value size below 16KB. 
1657 const txnCommitBatchSize = 16 * 1024 1658 1659 type batchMutations struct { 1660 region locate.RegionVerID 1661 mutations CommitterMutations 1662 isPrimary bool 1663 } 1664 1665 func (b *batchMutations) relocate(bo *retry.Backoffer, c *locate.RegionCache) (bool, error) { 1666 begin, end := b.mutations.GetKey(0), b.mutations.GetKey(b.mutations.Len()-1) 1667 loc, err := c.LocateKey(bo, begin) 1668 if err != nil { 1669 return false, errors.Trace(err) 1670 } 1671 if !loc.Contains(end) { 1672 return false, nil 1673 } 1674 b.region = loc.Region 1675 return true, nil 1676 } 1677 1678 type batched struct { 1679 batches []batchMutations 1680 primaryIdx int 1681 primaryKey []byte 1682 } 1683 1684 func newBatched(primaryKey []byte) *batched { 1685 return &batched{ 1686 primaryIdx: -1, 1687 primaryKey: primaryKey, 1688 } 1689 } 1690 1691 // appendBatchMutationsBySize appends mutations to b. It may split the keys to make 1692 // sure each batch's size does not exceed the limit. 1693 func (b *batched) appendBatchMutationsBySize(region locate.RegionVerID, mutations CommitterMutations, sizeFn func(k, v []byte) int, limit int) { 1694 if _, err := util.EvalFailpoint("twoPCRequestBatchSizeLimit"); err == nil { 1695 limit = 1 1696 } 1697 1698 var start, end int 1699 for start = 0; start < mutations.Len(); start = end { 1700 var size int 1701 for end = start; end < mutations.Len() && size < limit; end++ { 1702 var k, v []byte 1703 k = mutations.GetKey(end) 1704 v = mutations.GetValue(end) 1705 size += sizeFn(k, v) 1706 if b.primaryIdx < 0 && bytes.Equal(k, b.primaryKey) { 1707 b.primaryIdx = len(b.batches) 1708 } 1709 } 1710 b.batches = append(b.batches, batchMutations{ 1711 region: region, 1712 mutations: mutations.Slice(start, end), 1713 }) 1714 } 1715 } 1716 1717 func (b *batched) setPrimary() bool { 1718 // If the batches include the primary key, put it to the first 1719 if b.primaryIdx >= 0 { 1720 if len(b.batches) > 0 { 1721 b.batches[b.primaryIdx].isPrimary = true 1722 
b.batches[0], b.batches[b.primaryIdx] = b.batches[b.primaryIdx], b.batches[0]
			b.primaryIdx = 0
		}
		return true
	}

	return false
}

// allBatches returns every batch currently held by b, in stored order.
// The returned slice is b.batches itself, not a copy.
func (b *batched) allBatches() []batchMutations {
	return b.batches
}

// primaryBatch returns the batch containing the primary key.
// Precondition: `b.setPrimary() == true`
// The result is the one-element prefix b.batches[:1]; it shares its backing
// array with b.batches.
func (b *batched) primaryBatch() []batchMutations {
	return b.batches[:1]
}

// forgetPrimary drops the first batch from b.batches. It is a no-op when
// there are no batches. b.primaryIdx is not modified here.
func (b *batched) forgetPrimary() {
	if len(b.batches) == 0 {
		return
	}
	b.batches = b.batches[1:]
}

// batchExecutor is a txn controller that runs one two-phase-commit action over
// a set of batches, providing utilities such as rate control.
type batchExecutor struct {
	rateLim           int                  // number of concurrent workers
	rateLimiter       *util.RateLimit      // rate limiter for concurrency control; more strategies may be added
	committer         *twoPhaseCommitter   // the committer; other committer types may be added in the future
	action            twoPhaseCommitAction // the action to perform on each batch
	backoffer         *retry.Backoffer     // backoffer from which per-batch backoffers are derived
	tokenWaitDuration time.Duration        // accumulated time spent waiting for rate-limiter tokens
}

// newBatchExecutor creates an executor that handles concurrent batch work
// (prewrite/commit etc).
// The struct literal is positional: rateLimiter is left nil (it is created
// later by initUtils) and tokenWaitDuration starts at 0.
func newBatchExecutor(rateLimit int, committer *twoPhaseCommitter,
	action twoPhaseCommitAction, backoffer *retry.Backoffer) *batchExecutor {
	return &batchExecutor{rateLimit, nil, committer,
		action, backoffer, 0}
}

// initUtils initializes batchExecutor-related policies such as the rate
// limiter. As written it always returns nil.
func (batchExe *batchExecutor) initUtils() error {
	// init rateLimiter by injected rate limit number
	batchExe.rateLimiter = util.NewRateLimit(batchExe.rateLim)
	return nil
}

// startWorker concurrently does the work for each batch, subject to the rate
// limit. For every batch it first acquires a token from the rate limiter, then
// starts a goroutine that runs the action on that batch and sends the
// resulting error (possibly nil) on ch. If GetToken reports exit, the loop
// logs and stops without starting the remaining batches.
func (batchExe *batchExecutor) startWorker(exitCh chan struct{}, ch chan error, batches []batchMutations) {
	for idx, batch1 := range batches {
		waitStart := time.Now()
		// NOTE(review): GetToken presumably returns true when exitCh fires
		// before a token is available — confirm against util.RateLimit.
		if exit := batchExe.rateLimiter.GetToken(exitCh); !exit {
			batchExe.tokenWaitDuration += time.Since(waitStart)
			// Copy the loop variable so the goroutine below captures this
			// iteration's batch.
			batch := batch1
			go func() {
				// Give the token back once this batch is done.
				defer batchExe.rateLimiter.PutToken()
				var singleBatchBackoffer *retry.Backoffer
				if _, ok := batchExe.action.(actionCommit); ok {
					// Because the secondary batches of the commit actions are implemented to be
					// committed asynchronously in background goroutines, we should not
					// fork a child context and call cancel() while the foreground goroutine exits.
					// Otherwise the background goroutines will be canceled unexpectedly.
					// Here we make a new clone of the original backoffer for this goroutine
					// exclusively to avoid the data race when using the same backoffer
					// in concurrent goroutines.
					singleBatchBackoffer = batchExe.backoffer.Clone()
				} else {
					// Other actions get a forked backoffer whose cancel func
					// is called when this goroutine returns.
					var singleBatchCancel context.CancelFunc
					singleBatchBackoffer, singleBatchCancel = batchExe.backoffer.Fork()
					defer singleBatchCancel()
				}
				// Report the result first; the bookkeeping below runs after
				// the send.
				ch <- batchExe.action.handleSingleBatch(batchExe.committer, singleBatchBackoffer, batch)
				commitDetail := batchExe.committer.getDetail()
				// For prewrite, we record the max backoff time
				if _, ok := batchExe.action.(actionPrewrite); ok {
					commitDetail.Mu.Lock()
					// NOTE(review): the multiplication by time.Millisecond
					// suggests GetTotalSleep is in milliseconds and
					// CommitBackoffTime in nanoseconds — confirm.
					boSleep := int64(singleBatchBackoffer.GetTotalSleep()) * int64(time.Millisecond)
					if boSleep > commitDetail.Mu.CommitBackoffTime {
						commitDetail.Mu.CommitBackoffTime = boSleep
						commitDetail.Mu.BackoffTypes = singleBatchBackoffer.GetTypes()
					}
					commitDetail.Mu.Unlock()
				}
				// Backoff time in the 2nd phase of a non-async-commit txn is added
				// in the commitTxn method, so we don't add it here.
			}()
		} else {
			logutil.Logger(batchExe.backoffer.GetCtx()).Info("break startWorker",
				zap.Stringer("action", batchExe.action), zap.Int("batch size", len(batches)),
				zap.Int("index", idx))
			break
		}
	}
}

// process starts the worker routine and collects the results.
// It initializes the rate limiter, launches startWorker in its own goroutine,
// and then receives exactly len(batches) results from the result channel.
// It returns the first non-nil error received, or nil if every batch
// succeeded.
func (batchExe *batchExecutor) process(batches []batchMutations) error {
	var err error
	err = batchExe.initUtils()
	if err != nil {
		logutil.Logger(batchExe.backoffer.GetCtx()).Error("batchExecutor initUtils failed", zap.Error(err))
		return err
	}

	// For prewrite, stop sending other requests after receiving first error.
	// The executor's backoffer is replaced by a forked one so that cancel()
	// below affects the backoffers derived from it in startWorker. cancel
	// stays nil for every other action.
	var cancel context.CancelFunc
	if _, ok := batchExe.action.(actionPrewrite); ok {
		batchExe.backoffer, cancel = batchExe.backoffer.Fork()
		defer cancel()
	}
	// concurrently do the work for each batch.
	// ch is buffered with one slot per batch, so a worker's send never blocks.
	ch := make(chan error, len(batches))
	exitCh := make(chan struct{})
	go batchExe.startWorker(exitCh, ch, batches)
	// check results
	for i := 0; i < len(batches); i++ {
		if e := <-ch; e != nil {
			logutil.Logger(batchExe.backoffer.GetCtx()).Debug("2PC doActionOnBatch failed",
				zap.Uint64("session", batchExe.committer.sessionID),
				zap.Stringer("action type", batchExe.action),
				zap.Error(e),
				zap.Uint64("txnStartTS", batchExe.committer.startTS))
			// Cancel other requests and return the first error.
			// The loop still keeps receiving the remaining results.
			if cancel != nil {
				logutil.Logger(batchExe.backoffer.GetCtx()).Debug("2PC doActionOnBatch to cancel other actions",
					zap.Uint64("session", batchExe.committer.sessionID),
					zap.Stringer("action type", batchExe.action),
					zap.Uint64("txnStartTS", batchExe.committer.startTS))
				// Flag the committer's prewrite as cancelled before cancelling.
				atomic.StoreUint32(&batchExe.committer.prewriteCancelled, 1)
				cancel()
			}
			// Keep only the first error seen.
			if err == nil {
				err = e
			}
		}
	}
	close(exitCh)
	// Report the accumulated token wait time, in nanoseconds, if any.
	if batchExe.tokenWaitDuration > 0 {
		metrics.TiKVTokenWaitDuration.Observe(float64(batchExe.tokenWaitDuration.Nanoseconds()))
	}
	return err
}

// setDetail atomically stores d as the committer's commit details.
func (c *twoPhaseCommitter) setDetail(d *util.CommitDetails) {
	atomic.StorePointer(&c.detail, unsafe.Pointer(d))
}

// getDetail atomically loads the committer's commit details pointer.
// It returns whatever pointer is currently stored in c.detail.
func (c *twoPhaseCommitter) getDetail() *util.CommitDetails {
	return (*util.CommitDetails)(atomic.LoadPointer(&c.detail))
}

// setUndeterminedErr records err as the committer's undetermined error while
// holding the write lock of c.mu.
func (c *twoPhaseCommitter) setUndeterminedErr(err error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.mu.undeterminedErr = err
}

// getUndeterminedErr returns the committer's undetermined error, read under
// the read lock of c.mu.
func (c *twoPhaseCommitter) getUndeterminedErr() error {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.mu.undeterminedErr
}

// mutationsOfKeys collects the mutations in c.mutations whose key equals one
// of keys. Results follow the order of c.mutations, and each mutation is added
// at most once even if keys contains duplicates. Every mutation is compared
// against every key until a match, so the cost grows with
// c.mutations.Len() * len(keys).
func (c *twoPhaseCommitter) mutationsOfKeys(keys [][]byte) CommitterMutations {
	var res PlainMutations
	for i := 0; i < c.mutations.Len(); i++ {
		for _, key := range keys {
			if bytes.Equal(c.mutations.GetKey(i), key) {
				res.Push(c.mutations.GetOp(i), c.mutations.GetKey(i), c.mutations.GetValue(i), c.mutations.IsPessimisticLock(i))
				// One match is enough; move on to the next mutation.
				break
			}
		}
	}
	return &res
}