github.com/KinWaiYuen/client-go/v2@v2.5.4/txnkv/txnlock/lock_resolver.go

// Copyright 2021 TiKV Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package txnlock

import (
	"bytes"
	"container/list"
	"context"
	"encoding/hex"
	"fmt"
	"math"
	"sync"
	"time"

	"github.com/KinWaiYuen/client-go/v2/config"
	tikverr "github.com/KinWaiYuen/client-go/v2/error"
	"github.com/KinWaiYuen/client-go/v2/internal/client"
	"github.com/KinWaiYuen/client-go/v2/internal/locate"
	"github.com/KinWaiYuen/client-go/v2/internal/logutil"
	"github.com/KinWaiYuen/client-go/v2/internal/retry"
	"github.com/KinWaiYuen/client-go/v2/metrics"
	"github.com/KinWaiYuen/client-go/v2/oracle"
	"github.com/KinWaiYuen/client-go/v2/tikvrpc"
	"github.com/KinWaiYuen/client-go/v2/util"
	"github.com/pingcap/errors"
	"github.com/pingcap/kvproto/pkg/kvrpcpb"
	"go.uber.org/zap"
)

// ResolvedCacheSize is the max number of cached txn statuses.
const ResolvedCacheSize = 2048

type storage interface {
	// GetRegionCache gets the RegionCache.
	GetRegionCache() *locate.RegionCache
	// SendReq sends a request to TiKV.
	SendReq(bo *retry.Backoffer, req *tikvrpc.Request, regionID locate.RegionVerID, timeout time.Duration) (*tikvrpc.Response, error)
	// GetOracle gets a timestamp oracle client.
	GetOracle() oracle.Oracle
}

// LockResolver resolves locks and also caches resolved txn status.
type LockResolver struct {
	store                    storage
	resolveLockLiteThreshold uint64
	mu                       struct {
		sync.RWMutex
		// resolved caches resolved txns (FIFO, txn id -> txnStatus).
		resolved       map[uint64]TxnStatus
		recentResolved *list.List
	}
	testingKnobs struct {
		meetLock func(locks []*Lock)
	}
}

// NewLockResolver creates a new LockResolver instance.
func NewLockResolver(store storage) *LockResolver {
	r := &LockResolver{
		store:                    store,
		resolveLockLiteThreshold: config.GetGlobalConfig().TiKVClient.ResolveLockLiteThreshold,
	}
	r.mu.resolved = make(map[uint64]TxnStatus)
	r.mu.recentResolved = list.New()
	return r
}

// TxnStatus represents a txn's final status. It should be Lock or Commit or Rollback.
type TxnStatus struct {
	ttl         uint64
	commitTS    uint64
	action      kvrpcpb.Action
	primaryLock *kvrpcpb.LockInfo
}

// IsCommitted returns true if the txn's final status is Commit.
func (s TxnStatus) IsCommitted() bool { return s.ttl == 0 && s.commitTS > 0 }

// CommitTS returns the txn's commitTS. It is valid iff `IsCommitted` is true.
func (s TxnStatus) CommitTS() uint64 { return s.commitTS }

// TTL returns the TTL of the transaction if the transaction is still alive.
func (s TxnStatus) TTL() uint64 { return s.ttl }

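// A minimal sketch (not part of this file's API) of how a caller might interpret a
// TxnStatus returned by the resolver; the handleStatus helper is hypothetical:
//
//	func handleStatus(s txnlock.TxnStatus) {
//		switch {
//		case s.IsCommitted():
//			// ttl == 0 and commitTS > 0: the txn is committed; commit the
//			// remaining keys at s.CommitTS().
//		case s.TTL() == 0:
//			// ttl == 0 and commitTS == 0: the txn has been rolled back.
//		default:
//			// The txn is still alive; back off and retry later.
//		}
//	}
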
// Action returns what the CheckTxnStatus request has done to the transaction.
func (s TxnStatus) Action() kvrpcpb.Action { return s.action }

// StatusCacheable checks whether the transaction status is certain. True is
// returned only if its status is certain:
// If the transaction is already committed, the result can be cached.
// Otherwise:
// If l.LockType is the pessimistic lock type:
//   - if its primary lock is pessimistic too, the check txn status result should not be cached.
//   - if its primary lock is of the prewrite lock type, the check txn status result can be cached.
// If l.LockType is the prewrite lock type:
//   - always cache the check txn status result.
// For prewrite locks, their primary keys should ALWAYS be the correct one and will NOT change.
func (s TxnStatus) StatusCacheable() bool {
	if s.IsCommitted() {
		return true
	}
	if s.ttl == 0 {
		if s.action == kvrpcpb.Action_NoAction ||
			s.action == kvrpcpb.Action_LockNotExistRollback ||
			s.action == kvrpcpb.Action_TTLExpireRollback {
			return true
		}
	}
	return false
}

// Lock represents a lock from the tikv server.
type Lock struct {
	Key             []byte
	Primary         []byte
	TxnID           uint64
	TTL             uint64
	TxnSize         uint64
	LockType        kvrpcpb.Op
	UseAsyncCommit  bool
	LockForUpdateTS uint64
	MinCommitTS     uint64
}

func (l *Lock) String() string {
	buf := bytes.NewBuffer(make([]byte, 0, 128))
	buf.WriteString("key: ")
	buf.WriteString(hex.EncodeToString(l.Key))
	buf.WriteString(", primary: ")
	buf.WriteString(hex.EncodeToString(l.Primary))
	return fmt.Sprintf("%s, txnStartTS: %d, lockForUpdateTS:%d, minCommitTs:%d, ttl: %d, type: %s, UseAsyncCommit: %t, txnSize: %d",
		buf.String(), l.TxnID, l.LockForUpdateTS, l.MinCommitTS, l.TTL, l.LockType, l.UseAsyncCommit, l.TxnSize)
}

// NewLock creates a new *Lock.
func NewLock(l *kvrpcpb.LockInfo) *Lock {
	return &Lock{
		Key:             l.GetKey(),
		Primary:         l.GetPrimaryLock(),
		TxnID:           l.GetLockVersion(),
		TTL:             l.GetLockTtl(),
		TxnSize:         l.GetTxnSize(),
		LockType:        l.LockType,
		UseAsyncCommit:  l.UseAsyncCommit,
		LockForUpdateTS: l.LockForUpdateTs,
		MinCommitTS:     l.MinCommitTs,
	}
}

func (lr *LockResolver) saveResolved(txnID uint64, status TxnStatus) {
	lr.mu.Lock()
	defer lr.mu.Unlock()

	if _, ok := lr.mu.resolved[txnID]; ok {
		return
	}
	lr.mu.resolved[txnID] = status
	lr.mu.recentResolved.PushBack(txnID)
	if len(lr.mu.resolved) > ResolvedCacheSize {
		front := lr.mu.recentResolved.Front()
		delete(lr.mu.resolved, front.Value.(uint64))
		lr.mu.recentResolved.Remove(front)
	}
}

func (lr *LockResolver) getResolved(txnID uint64) (TxnStatus, bool) {
	lr.mu.RLock()
	defer lr.mu.RUnlock()

	s, ok := lr.mu.resolved[txnID]
	return s, ok
}

// BatchResolveLocks resolves locks in a batch.
// Use it in the GC worker only!
func (lr *LockResolver) BatchResolveLocks(bo *retry.Backoffer, locks []*Lock, loc locate.RegionVerID) (bool, error) {
	if len(locks) == 0 {
		return true, nil
	}

	metrics.LockResolverCountWithBatchResolve.Inc()

	// The GC worker kills all ongoing transactions, because it must make sure all
	// locks have been cleaned before GC.
	expiredLocks := locks

	txnInfos := make(map[uint64]uint64)
	startTime := time.Now()
	for _, l := range expiredLocks {
		if _, ok := txnInfos[l.TxnID]; ok {
			continue
		}
		metrics.LockResolverCountWithExpired.Inc()

		// Using currentTS = math.MaxUint64 means rolling back the txn, no matter whether the lock is expired or not!
		status, err := lr.getTxnStatus(bo, l.TxnID, l.Primary, 0, math.MaxUint64, true, false, l)
		if err != nil {
			return false, err
		}

		// If the transaction uses async commit, CheckTxnStatus will reject rolling back the primary lock.
		// Then we need to check the secondary locks to determine the final status of the transaction.
		if status.primaryLock != nil && status.primaryLock.UseAsyncCommit {
			resolveData, err := lr.checkAllSecondaries(bo, l, &status)
			if err == nil {
				txnInfos[l.TxnID] = resolveData.commitTs
				continue
			}
			if _, ok := errors.Cause(err).(*nonAsyncCommitLock); ok {
				status, err = lr.getTxnStatus(bo, l.TxnID, l.Primary, 0, math.MaxUint64, true, true, l)
				if err != nil {
					return false, err
				}
			} else {
				return false, err
			}
		}

		if status.ttl > 0 {
			logutil.BgLogger().Error("BatchResolveLocks fail to clean locks, this result is not expected!")
			return false, errors.New("TiDB ask TiKV to rollback locks but it doesn't, the protocol maybe wrong")
		}

		txnInfos[l.TxnID] = status.commitTS
	}
	logutil.BgLogger().Info("BatchResolveLocks: lookup txn status",
		zap.Duration("cost time", time.Since(startTime)),
		zap.Int("num of txn", len(txnInfos)))

	listTxnInfos := make([]*kvrpcpb.TxnInfo, 0, len(txnInfos))
	for txnID, status := range txnInfos {
		listTxnInfos = append(listTxnInfos, &kvrpcpb.TxnInfo{
			Txn:    txnID,
			Status: status,
		})
	}

	req := tikvrpc.NewRequest(tikvrpc.CmdResolveLock, &kvrpcpb.ResolveLockRequest{TxnInfos: listTxnInfos})
	req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
	startTime = time.Now()
	resp, err := lr.store.SendReq(bo, req, loc, client.ReadTimeoutShort)
	if err != nil {
		return false, errors.Trace(err)
	}

	regionErr, err := resp.GetRegionError()
	if err != nil {
		return false, errors.Trace(err)
	}

	if regionErr != nil {
		err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
		if err != nil {
			return false, errors.Trace(err)
		}
		return false, nil
	}

	if resp.Resp == nil {
		return false, errors.Trace(tikverr.ErrBodyMissing)
	}
	cmdResp := resp.Resp.(*kvrpcpb.ResolveLockResponse)
	if keyErr := cmdResp.GetError(); keyErr != nil {
		return false, errors.Errorf("unexpected resolve err: %s", keyErr)
	}

	logutil.BgLogger().Info("BatchResolveLocks: resolve locks in a batch",
		zap.Duration("cost time", time.Since(startTime)),
		zap.Int("num of locks", len(expiredLocks)))
	return true, nil
}
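// A minimal usage sketch, assuming the caller (e.g. a GC worker) has already
// scanned a region and collected its locks; gcResolveLockMaxBackoff and the
// surrounding scan/retry loop are hypothetical:
//
//	bo := retry.NewBackoffer(context.Background(), gcResolveLockMaxBackoff)
//	ok, err := resolver.BatchResolveLocks(bo, locks, loc.Region)
//	if err != nil {
//		return err
//	}
//	if !ok {
//		// A region error occurred and has been backed off; re-scan the
//		// region and call BatchResolveLocks again with the fresh locks.
//	}
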
// ResolveLocks tries to resolve Locks. The resolving process is in 3 steps:
//  1) Use the `lockTTL` to pick up all expired locks. Only locks that are too
//     old are considered orphan locks and will be handled later. If all locks
//     are expired then all locks will be resolved so the returned `ok` will be
//     true, otherwise the caller should sleep a while before retrying.
//  2) For each lock, query the primary key to get the commit status of the txn
//     which left the lock.
//  3) Send a `ResolveLock` cmd to the lock's region to resolve all locks belonging
//     to the same transaction.
func (lr *LockResolver) ResolveLocks(bo *retry.Backoffer, callerStartTS uint64, locks []*Lock) (int64, []uint64 /*pushed*/, error) {
	return lr.resolveLocks(bo, callerStartTS, locks, false, false)
}

// ResolveLocksLite resolves locks while preventing scanning the whole region.
func (lr *LockResolver) ResolveLocksLite(bo *retry.Backoffer, callerStartTS uint64, locks []*Lock) (int64, []uint64 /*pushed*/, error) {
	return lr.resolveLocks(bo, callerStartTS, locks, false, true)
}

func (lr *LockResolver) resolveLocks(bo *retry.Backoffer, callerStartTS uint64, locks []*Lock, forWrite bool, lite bool) (int64, []uint64 /*pushed*/, error) {
	if lr.testingKnobs.meetLock != nil {
		lr.testingKnobs.meetLock(locks)
	}
	var msBeforeTxnExpired txnExpireTime
	if len(locks) == 0 {
		return msBeforeTxnExpired.value(), nil, nil
	}

	if forWrite {
		metrics.LockResolverCountWithResolveForWrite.Inc()
	} else {
		metrics.LockResolverCountWithResolve.Inc()
	}

	var pushFail bool
	// TxnID -> []Region, record resolved Regions.
	// TODO: Maybe put it in LockResolver and share it among all txns.
	cleanTxns := make(map[uint64]map[locate.RegionVerID]struct{})
	var pushed []uint64
	// pushed is only used in the read operation.
	if !forWrite {
		pushed = make([]uint64, 0, len(locks))
	}

	var resolve func(*Lock, bool) error
	resolve = func(l *Lock, forceSyncCommit bool) error {
		status, err := lr.getTxnStatusFromLock(bo, l, callerStartTS, forceSyncCommit)
		if err != nil {
			return err
		}

		if status.ttl == 0 {
			metrics.LockResolverCountWithExpired.Inc()
			// If the lock is committed or rolled back, resolve the lock.
			cleanRegions, exists := cleanTxns[l.TxnID]
			if !exists {
				cleanRegions = make(map[locate.RegionVerID]struct{})
				cleanTxns[l.TxnID] = cleanRegions
			}

			if status.primaryLock != nil && !forceSyncCommit && status.primaryLock.UseAsyncCommit && !exists {
				err = lr.resolveLockAsync(bo, l, status)
				if _, ok := errors.Cause(err).(*nonAsyncCommitLock); ok {
					err = resolve(l, true)
				}
			} else if l.LockType == kvrpcpb.Op_PessimisticLock {
				err = lr.resolvePessimisticLock(bo, l, cleanRegions)
			} else {
				err = lr.resolveLock(bo, l, status, lite, cleanRegions)
			}
			if err != nil {
				return err
			}
		} else {
			metrics.LockResolverCountWithNotExpired.Inc()
			// If the lock is valid, the txn may be a pessimistic transaction.
			// Update the txn expire time.
			msBeforeLockExpired := lr.store.GetOracle().UntilExpired(l.TxnID, status.ttl, &oracle.Option{TxnScope: oracle.GlobalTxnScope})
			msBeforeTxnExpired.update(msBeforeLockExpired)
			if forWrite {
				// Write conflict detected!
				// If it's an optimistic conflict and the current txn is earlier than the lock owner,
				// abort the current transaction.
				// This avoids the deadlock scenario of two large transactions.
				if l.LockType != kvrpcpb.Op_PessimisticLock && l.TxnID > callerStartTS {
					metrics.LockResolverCountWithWriteConflict.Inc()
					return tikverr.NewErrWriteConfictWithArgs(callerStartTS, l.TxnID, status.commitTS, l.Key)
				}
			} else {
				if status.action != kvrpcpb.Action_MinCommitTSPushed {
					pushFail = true
					return nil
				}
				pushed = append(pushed, l.TxnID)
			}
		}
		return nil
	}

	for _, l := range locks {
		err := resolve(l, false)
		if err != nil {
			msBeforeTxnExpired.update(0)
			err = errors.Trace(err)
			return msBeforeTxnExpired.value(), nil, err
		}
	}
	if pushFail {
		// If any of the locks fail to push minCommitTS, don't return the pushed array.
		pushed = nil
	}

	if msBeforeTxnExpired.value() > 0 && len(pushed) == 0 {
		// If len(pushed) > 0, the caller will not block on the locks; it pushes the minCommitTS instead.
		metrics.LockResolverCountWithWaitExpired.Inc()
	}
	return msBeforeTxnExpired.value(), pushed, nil
}

// ResolveLocksForWrite resolves locks for write.
func (lr *LockResolver) ResolveLocksForWrite(bo *retry.Backoffer, callerStartTS, callerForUpdateTS uint64, locks []*Lock) (int64, error) {
	// The forWrite parameter is only useful for optimistic transactions, which can avoid deadlocks between large transactions,
	// so only use forWrite if the callerForUpdateTS is zero.
	msBeforeTxnExpired, _, err := lr.resolveLocks(bo, callerStartTS, locks, callerForUpdateTS == 0, false)
	return msBeforeTxnExpired, err
}

type txnExpireTime struct {
	initialized bool
	txnExpire   int64
}

func (t *txnExpireTime) update(lockExpire int64) {
	if lockExpire <= 0 {
		lockExpire = 0
	}
	if !t.initialized {
		t.txnExpire = lockExpire
		t.initialized = true
		return
	}
	if lockExpire < t.txnExpire {
		t.txnExpire = lockExpire
	}
}

func (t *txnExpireTime) value() int64 {
	if !t.initialized {
		return 0
	}
	return t.txnExpire
}

const getTxnStatusMaxBackoff = 20000

// GetTxnStatus queries tikv-server for a txn's status (commit/rollback).
// If the primary key is still locked, it will launch a Rollback to abort it.
// To avoid unnecessarily aborting too many txns, it is wiser to wait a few
// seconds before calling it after Prewrite.
func (lr *LockResolver) GetTxnStatus(txnID uint64, callerStartTS uint64, primary []byte) (TxnStatus, error) {
	var status TxnStatus
	bo := retry.NewBackoffer(context.Background(), getTxnStatusMaxBackoff)
	currentTS, err := lr.store.GetOracle().GetLowResolutionTimestamp(bo.GetCtx(), &oracle.Option{TxnScope: oracle.GlobalTxnScope})
	if err != nil {
		return status, err
	}
	return lr.getTxnStatus(bo, txnID, primary, callerStartTS, currentTS, true, false, nil)
}

func (lr *LockResolver) getTxnStatusFromLock(bo *retry.Backoffer, l *Lock, callerStartTS uint64, forceSyncCommit bool) (TxnStatus, error) {
	var currentTS uint64
	var err error
	var status TxnStatus

	if l.TTL == 0 {
		// NOTE: l.TTL = 0 is a special protocol!!!
		// When the pessimistic txn prewrite meets locks of a txn, it should resolve the lock **unconditionally**.
		// In this case, TiKV uses lock TTL = 0 to notify TiDB, and TiDB should resolve the lock!
		// Set currentTS to max uint64 to make the lock expired.
		currentTS = math.MaxUint64
	} else {
		currentTS, err = lr.store.GetOracle().GetLowResolutionTimestamp(bo.GetCtx(), &oracle.Option{TxnScope: oracle.GlobalTxnScope})
		if err != nil {
			return TxnStatus{}, err
		}
	}

	rollbackIfNotExist := false
	if _, err := util.EvalFailpoint("getTxnStatusDelay"); err == nil {
		time.Sleep(100 * time.Millisecond)
	}
	for {
		status, err = lr.getTxnStatus(bo, l.TxnID, l.Primary, callerStartTS, currentTS, rollbackIfNotExist, forceSyncCommit, l)
		if err == nil {
			return status, nil
		}
		// If the error is something other than txnNotFoundErr, throw the error (network
		// unavailable, tikv down, backoff timeout etc) to the caller.
		if _, ok := errors.Cause(err).(txnNotFoundErr); !ok {
			return TxnStatus{}, err
		}

		if _, err := util.EvalFailpoint("txnNotFoundRetTTL"); err == nil {
			return TxnStatus{ttl: l.TTL, action: kvrpcpb.Action_NoAction}, nil
		}

		// Handle the txnNotFound error.
		// getTxnStatus() returns it when the secondary locks exist while the primary lock doesn't.
		// This is likely to happen in a concurrent prewrite when the secondary regions
		// succeed before the primary region.
		if err := bo.Backoff(retry.BoTxnNotFound, err); err != nil {
			logutil.Logger(bo.GetCtx()).Warn("getTxnStatusFromLock backoff fail", zap.Error(err))
		}

		if lr.store.GetOracle().UntilExpired(l.TxnID, l.TTL, &oracle.Option{TxnScope: oracle.GlobalTxnScope}) <= 0 {
			logutil.Logger(bo.GetCtx()).Warn("lock txn not found, lock has expired",
				zap.Uint64("CallerStartTs", callerStartTS),
				zap.Stringer("lock str", l))
			if l.LockType == kvrpcpb.Op_PessimisticLock {
				if _, err := util.EvalFailpoint("txnExpireRetTTL"); err == nil {
					return TxnStatus{action: kvrpcpb.Action_LockNotExistDoNothing},
						errors.New("error txn not found and lock expired")
				}
			}
			// For pessimistic lock resolving, if the primary lock does not exist and rollbackIfNotExist is true,
			// Action_LockNotExistDoNothing will be returned as the status.
			rollbackIfNotExist = true
		} else {
			if l.LockType == kvrpcpb.Op_PessimisticLock {
				return TxnStatus{ttl: l.TTL}, nil
			}
		}
	}
}

type txnNotFoundErr struct {
	*kvrpcpb.TxnNotFound
}

func (e txnNotFoundErr) Error() string {
	return e.TxnNotFound.String()
}

// getTxnStatus sends the CheckTxnStatus request to the TiKV server.
// When rollbackIfNotExist is false, the caller should be careful with the txnNotFoundErr error.
func (lr *LockResolver) getTxnStatus(bo *retry.Backoffer, txnID uint64, primary []byte,
	callerStartTS, currentTS uint64, rollbackIfNotExist bool, forceSyncCommit bool, lockInfo *Lock) (TxnStatus, error) {
	if s, ok := lr.getResolved(txnID); ok {
		return s, nil
	}

	metrics.LockResolverCountWithQueryTxnStatus.Inc()

	// CheckTxnStatus may meet the following cases:
	// 1. LOCK
	//    1.1 Lock expired -- orphan lock, fail to update TTL, crash recovery etc.
	//    1.2 Lock TTL -- active transaction holding the lock.
	// 2. NO LOCK
	//    2.1 Txn Committed
	//    2.2 Txn Rolled back -- rollback itself, rollback by others, GC tomb etc.
	//    2.3 No lock -- pessimistic lock rollback, concurrent prewrite.
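	// A descriptive note (not an exhaustive specification) on how these cases show
	// up in the response handled below: a non-zero LockTtl indicates the lock is
	// still alive, a non-zero CommitVersion indicates the transaction was committed,
	// and both being zero indicates it has been rolled back or never locked.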

	var status TxnStatus
	resolvingPessimisticLock := lockInfo != nil && lockInfo.LockType == kvrpcpb.Op_PessimisticLock
	req := tikvrpc.NewRequest(tikvrpc.CmdCheckTxnStatus, &kvrpcpb.CheckTxnStatusRequest{
		PrimaryKey:               primary,
		LockTs:                   txnID,
		CallerStartTs:            callerStartTS,
		CurrentTs:                currentTS,
		RollbackIfNotExist:       rollbackIfNotExist,
		ForceSyncCommit:          forceSyncCommit,
		ResolvingPessimisticLock: resolvingPessimisticLock,
	})
	for {
		loc, err := lr.store.GetRegionCache().LocateKey(bo, primary)
		if err != nil {
			return status, errors.Trace(err)
		}
		req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
		resp, err := lr.store.SendReq(bo, req, loc.Region, client.ReadTimeoutShort)
		if err != nil {
			return status, errors.Trace(err)
		}
		regionErr, err := resp.GetRegionError()
		if err != nil {
			return status, errors.Trace(err)
		}
		if regionErr != nil {
			err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
			if err != nil {
				return status, errors.Trace(err)
			}
			continue
		}
		if resp.Resp == nil {
			return status, errors.Trace(tikverr.ErrBodyMissing)
		}
		cmdResp := resp.Resp.(*kvrpcpb.CheckTxnStatusResponse)
		if keyErr := cmdResp.GetError(); keyErr != nil {
			txnNotFound := keyErr.GetTxnNotFound()
			if txnNotFound != nil {
				return status, txnNotFoundErr{txnNotFound}
			}

			err = errors.Errorf("unexpected err: %s, tid: %v", keyErr, txnID)
			logutil.BgLogger().Error("getTxnStatus error", zap.Error(err))
			return status, err
		}
		status.action = cmdResp.Action
		status.primaryLock = cmdResp.LockInfo

		if status.primaryLock != nil && status.primaryLock.UseAsyncCommit && !forceSyncCommit {
			if !lr.store.GetOracle().IsExpired(txnID, cmdResp.LockTtl, &oracle.Option{TxnScope: oracle.GlobalTxnScope}) {
				status.ttl = cmdResp.LockTtl
			}
		} else if cmdResp.LockTtl != 0 {
			status.ttl = cmdResp.LockTtl
		} else {
			if cmdResp.CommitVersion == 0 {
				metrics.LockResolverCountWithQueryTxnStatusRolledBack.Inc()
			} else {
				metrics.LockResolverCountWithQueryTxnStatusCommitted.Inc()
			}

			status.commitTS = cmdResp.CommitVersion
			if status.StatusCacheable() {
				lr.saveResolved(txnID, status)
			}
		}

		return status, nil
	}
}

// asyncResolveData is data contributed by multiple goroutines when resolving locks using the async commit protocol. All
// data should be protected by the mutex field.
type asyncResolveData struct {
	mutex sync.Mutex
	// If any key has been committed (missingLock is true), then this is the commit ts. In that case, all locks should
	// be committed with the same commit timestamp. If no locks have been committed (missingLock is false), then we will
	// use max(all min commit ts) from all locks; i.e., it is the commit ts we should use. Note that a secondary lock's
	// commit ts may or may not be the same as the primary lock's min commit ts.
	commitTs    uint64
	keys        [][]byte
	missingLock bool
}

type nonAsyncCommitLock struct{}

func (*nonAsyncCommitLock) Error() string {
	return "CheckSecondaryLocks receives a non-async-commit lock"
}

// addKeys adds the keys from locks to data, keeping the other fields up to date. startTS and commitTS are for the
// transaction being resolved.
//
// In the async commit protocol, when checking locks, we send a list of keys to check and get back a list of locks. There
// will be a lock for every key which is locked. If there are fewer locks than keys, then a lock is missing because it
// has been committed, rolled back, or was never locked.
//
// In this function, locks is the list of locks, and expected is the number of keys. asyncResolveData.missingLock will be
// set to true if the lengths don't match. If the lengths do match, then the locks are added to asyncResolveData.keys
// and will need to be resolved by the caller.
func (data *asyncResolveData) addKeys(locks []*kvrpcpb.LockInfo, expected int, startTS uint64, commitTS uint64) error {
	data.mutex.Lock()
	defer data.mutex.Unlock()

	// Check locks to see if any have been committed or rolled back.
	if len(locks) < expected {
		logutil.BgLogger().Debug("addKeys: lock has been committed or rolled back", zap.Uint64("commit ts", commitTS), zap.Uint64("start ts", startTS))
		// A lock is missing - the transaction must either have been rolled back or committed.
		if !data.missingLock {
			// commitTS == 0 => lock has been rolled back.
			if commitTS != 0 && commitTS < data.commitTs {
				return errors.Errorf("commit TS must be greater or equal to min commit TS: commit ts: %v, min commit ts: %v", commitTS, data.commitTs)
			}
			data.commitTs = commitTS
		}
		data.missingLock = true

		if data.commitTs != commitTS {
			return errors.Errorf("commit TS mismatch in async commit recovery: %v and %v", data.commitTs, commitTS)
		}

		// We do not need to resolve the remaining locks because TiKV will have resolved them as appropriate.
		return nil
	}

	logutil.BgLogger().Debug("addKeys: all locks present", zap.Uint64("start ts", startTS))
	// Save all locks to be resolved.
	for _, lockInfo := range locks {
		if lockInfo.LockVersion != startTS {
			err := errors.Errorf("unexpected timestamp, expected: %v, found: %v", startTS, lockInfo.LockVersion)
			logutil.BgLogger().Error("addLocks error", zap.Error(err))
			return err
		}
		if !lockInfo.UseAsyncCommit {
			return &nonAsyncCommitLock{}
		}
		if !data.missingLock && lockInfo.MinCommitTs > data.commitTs {
			data.commitTs = lockInfo.MinCommitTs
		}
		data.keys = append(data.keys, lockInfo.Key)
	}

	return nil
}

func (lr *LockResolver) checkSecondaries(bo *retry.Backoffer, txnID uint64, curKeys [][]byte, curRegionID locate.RegionVerID, shared *asyncResolveData) error {
	checkReq := &kvrpcpb.CheckSecondaryLocksRequest{
		Keys:         curKeys,
		StartVersion: txnID,
	}
	req := tikvrpc.NewRequest(tikvrpc.CmdCheckSecondaryLocks, checkReq)
	metrics.LockResolverCountWithQueryCheckSecondaryLocks.Inc()
	req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
	resp, err := lr.store.SendReq(bo, req, curRegionID, client.ReadTimeoutShort)
	if err != nil {
		return errors.Trace(err)
	}
	regionErr, err := resp.GetRegionError()
	if err != nil {
		return errors.Trace(err)
	}
	if regionErr != nil {
		err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
		if err != nil {
			return errors.Trace(err)
		}

		logutil.BgLogger().Debug("checkSecondaries: region error, regrouping", zap.Uint64("txn id", txnID), zap.Uint64("region", curRegionID.GetID()))

		// If regions have changed, then we might need to regroup the keys. Since this should be rare and for the sake
		// of simplicity, we will resolve regions sequentially.
		regions, _, err := lr.store.GetRegionCache().GroupKeysByRegion(bo, curKeys, nil)
		if err != nil {
			return errors.Trace(err)
		}
		for regionID, keys := range regions {
			// Recursion will terminate because the resolve request succeeds or the Backoffer reaches its limit.
			if err = lr.checkSecondaries(bo, txnID, keys, regionID, shared); err != nil {
				return err
			}
		}
		return nil
	}
	if resp.Resp == nil {
		return errors.Trace(tikverr.ErrBodyMissing)
	}

	checkResp := resp.Resp.(*kvrpcpb.CheckSecondaryLocksResponse)
	return shared.addKeys(checkResp.Locks, len(curKeys), txnID, checkResp.CommitTs)
}

// resolveLockAsync resolves l assuming it was locked using the async commit protocol.
func (lr *LockResolver) resolveLockAsync(bo *retry.Backoffer, l *Lock, status TxnStatus) error {
	metrics.LockResolverCountWithResolveAsync.Inc()

	resolveData, err := lr.checkAllSecondaries(bo, l, &status)
	if err != nil {
		return err
	}

	status.commitTS = resolveData.commitTs

	resolveData.keys = append(resolveData.keys, l.Primary)
	keysByRegion, _, err := lr.store.GetRegionCache().GroupKeysByRegion(bo, resolveData.keys, nil)
	if err != nil {
		return errors.Trace(err)
	}

	logutil.BgLogger().Info("resolve async commit", zap.Uint64("startTS", l.TxnID), zap.Uint64("commitTS", status.commitTS))

	errChan := make(chan error, len(keysByRegion))
	// Resolve every lock in the transaction.
	for region, locks := range keysByRegion {
		curLocks := locks
		curRegion := region
		resolveBo, cancel := bo.Fork()
		defer cancel()

		go func() {
			errChan <- lr.resolveRegionLocks(resolveBo, l, curRegion, curLocks, status)
		}()
	}

	var errs []string
	for range keysByRegion {
		err1 := <-errChan
		if err1 != nil {
			errs = append(errs, err1.Error())
		}
	}

	if len(errs) > 0 {
		return errors.Errorf("async commit recovery (sending ResolveLock) finished with errors: %v", errs)
	}

	return nil
}

// checkAllSecondaries checks the secondary locks of an async commit transaction to find out the final
// status of the transaction.
func (lr *LockResolver) checkAllSecondaries(bo *retry.Backoffer, l *Lock, status *TxnStatus) (*asyncResolveData, error) {
	regions, _, err := lr.store.GetRegionCache().GroupKeysByRegion(bo, status.primaryLock.Secondaries, nil)
	if err != nil {
		return nil, errors.Trace(err)
	}

	shared := asyncResolveData{
		mutex:       sync.Mutex{},
		commitTs:    status.primaryLock.MinCommitTs,
		keys:        [][]byte{},
		missingLock: false,
	}

	errChan := make(chan error, len(regions))
	for regionID, keys := range regions {
		curRegionID := regionID
		curKeys := keys
		checkBo, cancel := bo.Fork()
		defer cancel()

		go func() {
			errChan <- lr.checkSecondaries(checkBo, l.TxnID, curKeys, curRegionID, &shared)
		}()
	}

	for range regions {
		err := <-errChan
		if err != nil {
			return nil, err
		}
	}

	return &shared, nil
}

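// A worked example of the commit-ts decision implemented by checkAllSecondaries
// and addKeys (a sketch, not an exhaustive specification): suppose the primary
// lock's MinCommitTs is 100 and the secondary locks report MinCommitTs 100, 105
// and 103 with no lock missing; the transaction is then committed at
// max(100, 105, 103) = 105. If instead one secondary key has no lock and TiKV
// reports CommitTs 0 for it, the transaction is treated as rolled back, while a
// non-zero reported CommitTs is used as the commit timestamp for all keys.
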
// resolveRegionLocks is essentially the same as resolveLock, but we resolve all keys in the same region at the same time.
func (lr *LockResolver) resolveRegionLocks(bo *retry.Backoffer, l *Lock, region locate.RegionVerID, keys [][]byte, status TxnStatus) error {
	lreq := &kvrpcpb.ResolveLockRequest{
		StartVersion: l.TxnID,
	}
	if status.IsCommitted() {
		lreq.CommitVersion = status.CommitTS()
	}
	lreq.Keys = keys
	req := tikvrpc.NewRequest(tikvrpc.CmdResolveLock, lreq)
	req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
	resp, err := lr.store.SendReq(bo, req, region, client.ReadTimeoutShort)
	if err != nil {
		return errors.Trace(err)
	}

	regionErr, err := resp.GetRegionError()
	if err != nil {
		return errors.Trace(err)
	}
	if regionErr != nil {
		err := bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
		if err != nil {
			return errors.Trace(err)
		}

		logutil.BgLogger().Info("resolveRegionLocks region error, regrouping", zap.String("lock", l.String()), zap.Uint64("region", region.GetID()))

		// Regroup locks.
		regions, _, err := lr.store.GetRegionCache().GroupKeysByRegion(bo, keys, nil)
		if err != nil {
			return errors.Trace(err)
		}
		for regionID, keys := range regions {
			// Recursion will terminate because the resolve request succeeds or the Backoffer reaches its limit.
			if err = lr.resolveRegionLocks(bo, l, regionID, keys, status); err != nil {
				return err
			}
		}
		return nil
	}
	if resp.Resp == nil {
		return errors.Trace(tikverr.ErrBodyMissing)
	}
	cmdResp := resp.Resp.(*kvrpcpb.ResolveLockResponse)
	if keyErr := cmdResp.GetError(); keyErr != nil {
		err = errors.Errorf("unexpected resolve err: %s, lock: %v", keyErr, l)
		logutil.BgLogger().Error("resolveLock error", zap.Error(err))
	}

	return nil
}

func (lr *LockResolver) resolveLock(bo *retry.Backoffer, l *Lock, status TxnStatus, lite bool, cleanRegions map[locate.RegionVerID]struct{}) error {
	metrics.LockResolverCountWithResolveLocks.Inc()
	resolveLite := lite || l.TxnSize < lr.resolveLockLiteThreshold
	for {
		loc, err := lr.store.GetRegionCache().LocateKey(bo, l.Key)
		if err != nil {
			return errors.Trace(err)
		}
		if _, ok := cleanRegions[loc.Region]; ok {
			return nil
		}
		lreq := &kvrpcpb.ResolveLockRequest{
			StartVersion: l.TxnID,
		}
		if status.IsCommitted() {
			lreq.CommitVersion = status.CommitTS()
		} else {
			logutil.BgLogger().Info("resolveLock rollback", zap.String("lock", l.String()))
		}

		if resolveLite {
			// Only resolve the specified keys when it is a small transaction,
			// to prevent scanning the whole region in this case.
			metrics.LockResolverCountWithResolveLockLite.Inc()
			lreq.Keys = [][]byte{l.Key}
		}
		req := tikvrpc.NewRequest(tikvrpc.CmdResolveLock, lreq)
		req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
		resp, err := lr.store.SendReq(bo, req, loc.Region, client.ReadTimeoutShort)
		if err != nil {
			return errors.Trace(err)
		}
		regionErr, err := resp.GetRegionError()
		if err != nil {
			return errors.Trace(err)
		}
		if regionErr != nil {
			err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
			if err != nil {
				return errors.Trace(err)
			}
			continue
		}
		if resp.Resp == nil {
			return errors.Trace(tikverr.ErrBodyMissing)
		}
		cmdResp := resp.Resp.(*kvrpcpb.ResolveLockResponse)
		if keyErr := cmdResp.GetError(); keyErr != nil {
			err = errors.Errorf("unexpected resolve err: %s, lock: %v", keyErr, l)
			logutil.BgLogger().Error("resolveLock error", zap.Error(err))
			return err
		}
		if !resolveLite {
			cleanRegions[loc.Region] = struct{}{}
		}
		return nil
	}
}

func (lr *LockResolver) resolvePessimisticLock(bo *retry.Backoffer, l *Lock, cleanRegions map[locate.RegionVerID]struct{}) error {
	metrics.LockResolverCountWithResolveLocks.Inc()
	for {
		loc, err := lr.store.GetRegionCache().LocateKey(bo, l.Key)
		if err != nil {
			return errors.Trace(err)
		}
		if _, ok := cleanRegions[loc.Region]; ok {
			return nil
		}
		forUpdateTS := l.LockForUpdateTS
		if forUpdateTS == 0 {
			forUpdateTS = math.MaxUint64
		}
		pessimisticRollbackReq := &kvrpcpb.PessimisticRollbackRequest{
			StartVersion: l.TxnID,
			ForUpdateTs:  forUpdateTS,
			Keys:         [][]byte{l.Key},
		}
		req := tikvrpc.NewRequest(tikvrpc.CmdPessimisticRollback, pessimisticRollbackReq)
		req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
		resp, err := lr.store.SendReq(bo, req, loc.Region, client.ReadTimeoutShort)
		if err != nil {
			return errors.Trace(err)
		}
		regionErr, err := resp.GetRegionError()
		if err != nil {
			return errors.Trace(err)
		}
		if regionErr != nil {
			err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
			if err != nil {
				return errors.Trace(err)
			}
			continue
		}
		if resp.Resp == nil {
			return errors.Trace(tikverr.ErrBodyMissing)
		}
		cmdResp := resp.Resp.(*kvrpcpb.PessimisticRollbackResponse)
		if keyErr := cmdResp.GetErrors(); len(keyErr) > 0 {
			err = errors.Errorf("unexpected resolve pessimistic lock err: %s, lock: %v", keyErr[0], l)
			logutil.Logger(bo.GetCtx()).Error("resolveLock error", zap.Error(err))
			return err
		}
		return nil
	}
}
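
// A minimal end-to-end usage sketch from a caller's point of view (assuming the
// caller has collected kvrpcpb.LockInfo entries from a failed read and imports
// this package as txnlock); lockInfos, ctx, callerStartTS and
// resolveLockMaxBackoff are hypothetical:
//
//	locks := make([]*txnlock.Lock, 0, len(lockInfos))
//	for _, info := range lockInfos {
//		locks = append(locks, txnlock.NewLock(info))
//	}
//	bo := retry.NewBackoffer(ctx, resolveLockMaxBackoff)
//	msBeforeExpired, _, err := resolver.ResolveLocks(bo, callerStartTS, locks)
//	if err != nil {
//		return err
//	}
//	if msBeforeExpired > 0 {
//		// Some locks are still held by live transactions; back off for up to
//		// msBeforeExpired milliseconds before retrying the read.
//	}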