github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/txnwait/queue.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package txnwait

import (
	"bytes"
	"context"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/envutil"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

const maxWaitForQueryTxn = 50 * time.Millisecond

// TxnLivenessHeartbeatMultiplier specifies what multiple the transaction
// liveness threshold should be of the transaction heartbeat interval.
var TxnLivenessHeartbeatMultiplier = envutil.EnvOrDefaultInt(
	"COCKROACH_TXN_LIVENESS_HEARTBEAT_MULTIPLIER", 5)

// TxnLivenessThreshold is the maximum duration between transaction heartbeats
// before the transaction is considered expired by Queue. It is exposed and
// mutable to allow tests to override it.
//
// Use TestingOverrideTxnLivenessThreshold to override the value in tests.
var TxnLivenessThreshold = time.Duration(TxnLivenessHeartbeatMultiplier) * base.DefaultTxnHeartbeatInterval

// TestingOverrideTxnLivenessThreshold allows tests to override the transaction
// liveness threshold. The function returns a closure that should be called to
// reset the value.
func TestingOverrideTxnLivenessThreshold(t time.Duration) func() {
	old := TxnLivenessThreshold
	TxnLivenessThreshold = t
	return func() {
		TxnLivenessThreshold = old
	}
}

// ShouldPushImmediately returns whether the PushTxn request should
// proceed without queueing. This is true for pushes which are neither
// ABORT nor TIMESTAMP, but also for ABORT and TIMESTAMP pushes where
// the pushee has min priority or pusher has max priority.
func ShouldPushImmediately(req *roachpb.PushTxnRequest) bool {
	if req.Force {
		return true
	}
	if !(req.PushType == roachpb.PUSH_ABORT || req.PushType == roachpb.PUSH_TIMESTAMP) {
		return true
	}
	p1, p2 := req.PusherTxn.Priority, req.PusheeTxn.Priority
	if p1 > p2 && (p1 == enginepb.MaxTxnPriority || p2 == enginepb.MinTxnPriority) {
		return true
	}
	return false
}

// isPushed returns whether the PushTxn request has already been
// fulfilled by the current transaction state. This may be true
// for transactions with pushed timestamps.
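// For example (hypothetical values, added for illustration), a
// PUSH_TIMESTAMP to wall time 10 is already satisfied by a pushee whose
// write timestamp has reached 10, even though the pushee is still PENDING:
//
//	req.PushType = roachpb.PUSH_TIMESTAMP
//	req.PushTo = hlc.Timestamp{WallTime: 10}
//	txn.WriteTimestamp = hlc.Timestamp{WallTime: 10}
//	isPushed(req, txn) // true: req.PushTo.LessEq(txn.WriteTimestamp)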
func isPushed(req *roachpb.PushTxnRequest, txn *roachpb.Transaction) bool {
	return (txn.Status.IsFinalized() ||
		(req.PushType == roachpb.PUSH_TIMESTAMP && req.PushTo.LessEq(txn.WriteTimestamp)))
}

// TxnExpiration computes the timestamp after which the transaction will be
// considered expired.
func TxnExpiration(txn *roachpb.Transaction) hlc.Timestamp {
	return txn.LastActive().Add(TxnLivenessThreshold.Nanoseconds(), 0)
}

// IsExpired is true if the given transaction is expired.
func IsExpired(now hlc.Timestamp, txn *roachpb.Transaction) bool {
	return TxnExpiration(txn).Less(now)
}

// createPushTxnResponse returns a PushTxnResponse struct with a
// copy of the supplied transaction. It is necessary to fully copy
// each field in the transaction to avoid race conditions.
func createPushTxnResponse(txn *roachpb.Transaction) *roachpb.PushTxnResponse {
	return &roachpb.PushTxnResponse{PusheeTxn: *txn}
}

// A waitingPush represents a PushTxn command that is waiting on the
// pushee transaction to commit or abort. It maintains a transitive
// set of all txns which are waiting on this txn in order to detect
// dependency cycles.
type waitingPush struct {
	req *roachpb.PushTxnRequest
	// pending channel receives updated, pushed txn or nil if queue is cleared.
	pending chan *roachpb.Transaction
	mu      struct {
		syncutil.Mutex
		dependents map[uuid.UUID]struct{} // transitive set of txns waiting on this txn
	}
}

// A waitingQueries object represents one or more QueryTxn commands that are
// waiting on the same target transaction to change status or acquire new
// dependencies.
type waitingQueries struct {
	pending chan struct{}
	count   int
}

// A pendingTxn represents a transaction waiting to be pushed by one
// or more PushTxn requests.
type pendingTxn struct {
	txn           atomic.Value // the most recent txn record
	waitingPushes []*waitingPush
}

func (pt *pendingTxn) getTxn() *roachpb.Transaction {
	return pt.txn.Load().(*roachpb.Transaction)
}

func (pt *pendingTxn) getDependentsSet() map[uuid.UUID]struct{} {
	set := map[uuid.UUID]struct{}{}
	for _, push := range pt.waitingPushes {
		if id := push.req.PusherTxn.ID; id != (uuid.UUID{}) {
			set[id] = struct{}{}
			push.mu.Lock()
			if push.mu.dependents != nil {
				for txnID := range push.mu.dependents {
					set[txnID] = struct{}{}
				}
			}
			push.mu.Unlock()
		}
	}
	return set
}

// Config contains the dependencies to construct a Queue.
type Config struct {
	RangeDesc *roachpb.RangeDescriptor
	DB        *kv.DB
	Clock     *hlc.Clock
	Stopper   *stop.Stopper
	Metrics   *Metrics
	Knobs     TestingKnobs
}

// TestingKnobs represents testing knobs for a Queue.
type TestingKnobs struct {
	// OnPusherBlocked is called when a would-be pusher joins a wait queue.
	OnPusherBlocked func(ctx context.Context, push *roachpb.PushTxnRequest)
	// OnTxnUpdate is called by Queue.UpdateTxn.
	OnTxnUpdate func(ctx context.Context, txn *roachpb.Transaction)
}

// Queue enqueues PushTxn requests which are waiting on extant txns
// with conflicting intents to abort or commit.
//
// Internally, it maintains a map from extant txn IDs to queues of pending
// PushTxn requests.
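// It also maintains a map from txn IDs to waiting QueryTxn requests (see
// MaybeWaitForQuery), which are released whenever the tracked transaction's
// status, priority, or set of dependents changes.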
//
// When a write intent is encountered, the command which encountered it (called
// the "pusher" here) initiates a PushTxn request to determine the disposition
// of the intent's transaction (called the "pushee" here). This queue is where a
// PushTxn request will wait if it discovers that the pushee's transaction is
// still pending, and cannot be otherwise aborted or pushed forward.
//
// Queue is thread safe.
type Queue struct {
	cfg Config
	mu  struct {
		syncutil.Mutex
		txns    map[uuid.UUID]*pendingTxn
		queries map[uuid.UUID]*waitingQueries
	}
}

// NewQueue instantiates a new Queue.
func NewQueue(cfg Config) *Queue {
	return &Queue{cfg: cfg}
}

// Enable allows transactions to be enqueued and waiting pushers
// added. This method must be idempotent as it can be invoked multiple
// times as range leases are updated for the same replica.
func (q *Queue) Enable() {
	q.mu.Lock()
	defer q.mu.Unlock()
	if q.mu.txns == nil {
		q.mu.txns = map[uuid.UUID]*pendingTxn{}
	}
	if q.mu.queries == nil {
		q.mu.queries = map[uuid.UUID]*waitingQueries{}
	}
}

// Clear empties the queue and releases all waiters. This method should
// be invoked when the replica loses or transfers its lease. If
// `disable` is true, future transactions may not be enqueued or
// waiting pushers added. Call Enable() once the lease is again
// acquired by the replica.
func (q *Queue) Clear(disable bool) {
	q.mu.Lock()
	var pushWaiters []chan *roachpb.Transaction
	for _, pt := range q.mu.txns {
		for _, w := range pt.waitingPushes {
			pushWaiters = append(pushWaiters, w.pending)
		}
		pt.waitingPushes = nil
	}

	queryWaiters := q.mu.queries
	queryWaitersCount := 0
	for _, waitingQueries := range queryWaiters {
		queryWaitersCount += waitingQueries.count
	}

	metrics := q.cfg.Metrics
	metrics.PusheeWaiting.Dec(int64(len(q.mu.txns)))
	metrics.PusherWaiting.Dec(int64(len(pushWaiters)))
	metrics.QueryWaiting.Dec(int64(queryWaitersCount))

	if log.V(1) {
		log.Infof(
			context.Background(),
			"clearing %d push waiters and %d query waiters",
			len(pushWaiters),
			queryWaitersCount,
		)
	}

	if disable {
		q.mu.txns = nil
		q.mu.queries = nil
	} else {
		q.mu.txns = map[uuid.UUID]*pendingTxn{}
		q.mu.queries = map[uuid.UUID]*waitingQueries{}
	}
	q.mu.Unlock()

	// Send on the pending push waiter channels outside of the mutex lock.
	for _, w := range pushWaiters {
		w <- nil
	}
	// Close query waiters outside of the mutex lock.
	for _, w := range queryWaiters {
		close(w.pending)
	}
}

// IsEnabled is true if the queue is enabled.
func (q *Queue) IsEnabled() bool {
	q.mu.Lock()
	defer q.mu.Unlock()
	return q.mu.txns != nil
}

// OnRangeDescUpdated informs the Queue that its Range has been updated.
func (q *Queue) OnRangeDescUpdated(desc *roachpb.RangeDescriptor) {
	q.mu.Lock()
	defer q.mu.Unlock()
	q.cfg.RangeDesc = desc
}

// RangeContainsKeyLocked returns whether the Queue's Range contains the
// specified key.
func (q *Queue) RangeContainsKeyLocked(key roachpb.Key) bool {
	return kvserverbase.ContainsKey(q.cfg.RangeDesc, key)
}

// EnqueueTxn creates a new pendingTxn for the target txn of a failed
// PushTxn command.
// Subsequent PushTxn requests for the same txn
// will be enqueued behind the pendingTxn via MaybeWaitForPush().
func (q *Queue) EnqueueTxn(txn *roachpb.Transaction) {
	q.mu.Lock()
	defer q.mu.Unlock()
	if q.mu.txns == nil {
		// Not enabled; do nothing.
		return
	}
	// If the txn which failed to push is already pending, update the
	// transaction status.
	if pt, ok := q.mu.txns[txn.ID]; ok {
		pt.txn.Store(txn)
	} else {
		q.cfg.Metrics.PusheeWaiting.Inc(1)
		pt = &pendingTxn{}
		pt.txn.Store(txn)
		q.mu.txns[txn.ID] = pt
	}
}

// UpdateTxn is invoked to update a transaction's status after a successful
// PushTxn or EndTxn command. It unblocks all pending waiters.
func (q *Queue) UpdateTxn(ctx context.Context, txn *roachpb.Transaction) {
	txn.AssertInitialized(ctx)
	q.mu.Lock()
	if f := q.cfg.Knobs.OnTxnUpdate; f != nil {
		f(ctx, txn)
	}

	q.releaseWaitingQueriesLocked(ctx, txn.ID)

	if q.mu.txns == nil {
		// Not enabled; do nothing.
		q.mu.Unlock()
		return
	}

	pending, ok := q.mu.txns[txn.ID]
	if !ok {
		q.mu.Unlock()
		return
	}
	waitingPushes := pending.waitingPushes
	pending.waitingPushes = nil
	delete(q.mu.txns, txn.ID)
	pending.txn.Store(txn)
	q.mu.Unlock()

	metrics := q.cfg.Metrics
	metrics.PusheeWaiting.Dec(1)
	metrics.PusherWaiting.Dec(int64(len(waitingPushes)))

	if log.V(1) && len(waitingPushes) > 0 {
		log.Infof(ctx, "updating %d push waiters for %s", len(waitingPushes), txn.ID.Short())
	}
	// Send on pending waiter channels outside of the mutex lock.
	for _, w := range waitingPushes {
		w.pending <- txn
	}
}

// GetDependents returns a slice of transactions waiting on the specified
// txn either directly or indirectly.
func (q *Queue) GetDependents(txnID uuid.UUID) []uuid.UUID {
	q.mu.Lock()
	defer q.mu.Unlock()
	if q.mu.txns == nil {
		// Not enabled; do nothing.
		return nil
	}
	if pending, ok := q.mu.txns[txnID]; ok {
		set := pending.getDependentsSet()
		dependents := make([]uuid.UUID, 0, len(set))
		for txnID := range set {
			dependents = append(dependents, txnID)
		}
		return dependents
	}
	return nil
}

// isTxnUpdated returns whether the transaction specified in
// the QueryTxnRequest has had its status or priority updated
// or whether the known set of dependent transactions has
// changed.
func (q *Queue) isTxnUpdated(pending *pendingTxn, req *roachpb.QueryTxnRequest) bool {
	// First check whether txn status or priority has changed.
	txn := pending.getTxn()
	if txn.Status.IsFinalized() || txn.Priority > req.Txn.Priority {
		return true
	}
	// Next, see if there is any discrepancy in the set of known dependents.
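	// Dependents only accumulate while a pendingTxn exists, so a
	// discrepancy here generally means that new transitive pushers have
	// arrived which the waiting query should learn about for deadlock
	// detection.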
	set := pending.getDependentsSet()
	if len(req.KnownWaitingTxns) != len(set) {
		return true
	}
	for _, txnID := range req.KnownWaitingTxns {
		if _, ok := set[txnID]; !ok {
			return true
		}
	}
	return false
}

func (q *Queue) releaseWaitingQueriesLocked(ctx context.Context, txnID uuid.UUID) {
	if w, ok := q.mu.queries[txnID]; ok {
		metrics := q.cfg.Metrics
		metrics.QueryWaiting.Dec(int64(w.count))
		log.VEventf(ctx, 2, "releasing %d waiting queries for %s", w.count, txnID.Short())
		close(w.pending)
		delete(q.mu.queries, txnID)
	}
}

// MaybeWaitForPush checks whether there is a queue already
// established for pushing the transaction. If not, or if the PushTxn
// request isn't queueable, return immediately. If there is a queue,
// enqueue this request as a waiter and enter a select loop waiting
// for resolution.
//
// If the transaction is successfully pushed while this method is waiting,
// the first return value is a non-nil PushTxnResponse object.
func (q *Queue) MaybeWaitForPush(
	ctx context.Context, req *roachpb.PushTxnRequest,
) (*roachpb.PushTxnResponse, *roachpb.Error) {
	if ShouldPushImmediately(req) {
		return nil, nil
	}

	q.mu.Lock()
	// If the txn wait queue is not enabled or if the request is not
	// contained within the replica, do nothing. The request can fall
	// outside of the replica after a split or merge. Note that the
	// ContainsKey check is done under the txn wait queue's lock to
	// ensure that it's not cleared before an incorrect insertion happens.
	if q.mu.txns == nil || !q.RangeContainsKeyLocked(req.Key) {
		q.mu.Unlock()
		return nil, nil
	}

	// If there's no pending queue for this txn, return not pushed. If
	// already pushed, return push success.
	pending, ok := q.mu.txns[req.PusheeTxn.ID]
	if !ok {
		q.mu.Unlock()
		return nil, nil
	}
	if txn := pending.getTxn(); isPushed(req, txn) {
		q.mu.Unlock()
		return createPushTxnResponse(txn), nil
	}

	push := &waitingPush{
		req:     req,
		pending: make(chan *roachpb.Transaction, 1),
	}
	pending.waitingPushes = append(pending.waitingPushes, push)
	if f := q.cfg.Knobs.OnPusherBlocked; f != nil {
		f(ctx, req)
	}
	// Because we're adding another dependent on the pending
	// transaction, signal the waiting queries to indicate there is a
	// new dependent and they should proceed to execute the QueryTxn
	// command.
	q.releaseWaitingQueriesLocked(ctx, req.PusheeTxn.ID)

	if req.PusherTxn.ID != (uuid.UUID{}) {
		log.VEventf(
			ctx,
			2,
			"%s pushing %s (%d pending)",
			req.PusherTxn.ID.Short(),
			req.PusheeTxn.ID.Short(),
			len(pending.waitingPushes),
		)
	} else {
		log.VEventf(ctx, 2, "pushing %s (%d pending)", req.PusheeTxn.ID.Short(), len(pending.waitingPushes))
	}
	q.mu.Unlock()

	// Set up channels to be notified of updates to the pusher txn, i.e.
	// when its status, priority, or dependents (for deadlock detection)
	// change.
	var queryPusherCh <-chan *roachpb.Transaction // accepts updates to the pusher txn
	var queryPusherErrCh <-chan *roachpb.Error    // accepts errors querying the pusher txn
	var readyCh chan struct{}                     // signaled when pusher txn should be queried

	// Query the pusher if it's a valid read-write transaction.
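	// Only a locking pusher can itself be waited on by others, so a
	// non-locking (read-only) pusher cannot be part of a dependency
	// cycle and does not need to be monitored for deadlock detection.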
	if req.PusherTxn.ID != uuid.Nil && req.PusherTxn.IsLocking() {
		// Create a context which will be canceled once this call completes.
		// This ensures that the goroutine created to query the pusher txn
		// is properly cleaned up.
		var cancel func()
		ctx, cancel = context.WithCancel(ctx)
		readyCh = make(chan struct{}, 1)
		queryPusherCh, queryPusherErrCh = q.startQueryPusherTxn(ctx, push, readyCh)
		// Ensure that the pusher querying goroutine is complete at exit.
		defer func() {
			cancel()
			if queryPusherErrCh != nil {
				<-queryPusherErrCh
			}
		}()
	}
	pusherPriority := req.PusherTxn.Priority
	pusheePriority := req.PusheeTxn.Priority

	metrics := q.cfg.Metrics
	metrics.PusherWaiting.Inc(1)
	tBegin := timeutil.Now()
	defer func() { metrics.PusherWaitTime.RecordValue(timeutil.Since(tBegin).Nanoseconds()) }()

	slowTimerThreshold := time.Minute
	slowTimer := timeutil.NewTimer()
	defer slowTimer.Stop()
	slowTimer.Reset(slowTimerThreshold)

	var pusheeTxnTimer timeutil.Timer
	defer pusheeTxnTimer.Stop()
	// The first time we want to check the pushee's txn record immediately:
	// the pushee might be gone by the time the pusher gets here if it cleaned
	// itself up after the pusher saw an intent but before it entered this
	// queue.
	pusheeTxnTimer.Reset(0)
	for {
		select {
		case <-slowTimer.C:
			slowTimer.Read = true
			metrics.PusherSlow.Inc(1)
			log.Warningf(ctx, "pusher %s: have been waiting %.2fs for pushee %s",
				req.PusherTxn.ID.Short(),
				timeutil.Since(tBegin).Seconds(),
				req.PusheeTxn.ID.Short(),
			)
			defer func() {
				metrics.PusherSlow.Dec(1)
				log.Warningf(ctx, "pusher %s: finished waiting after %.2fs for pushee %s",
					req.PusherTxn.ID.Short(),
					timeutil.Since(tBegin).Seconds(),
					req.PusheeTxn.ID.Short(),
				)
			}()
		case <-ctx.Done():
			// Caller has given up.
			log.VEvent(ctx, 2, "pusher giving up due to context cancellation")
			return nil, roachpb.NewError(ctx.Err())
		case <-q.cfg.Stopper.ShouldQuiesce():
			// Let the pushers out so that they can be sent looking elsewhere.
			return nil, nil
		case txn := <-push.pending:
			log.VEventf(ctx, 2, "result of pending push: %v", txn)
			// If txn is nil, the queue was cleared, presumably because the
			// replica lost the range lease. Return not pushed so request
			// proceeds and is redirected to the new range lease holder.
			if txn == nil {
				return nil, nil
			}
			// Transaction was committed, aborted, or had its timestamp
			// pushed. If this PushTxn request is satisfied, return
			// successful PushTxn response.
			if isPushed(req, txn) {
				log.VEvent(ctx, 2, "push request is satisfied")
				return createPushTxnResponse(txn), nil
			}
			// If not successfully pushed, return not pushed so request proceeds.
			log.VEvent(ctx, 2, "not pushed; returning to caller")
			return nil, nil

		case <-pusheeTxnTimer.C:
			log.VEvent(ctx, 2, "querying pushee")
			pusheeTxnTimer.Read = true
			// Periodically check whether the pushee txn has been abandoned.
			updatedPushee, _, pErr := q.queryTxnStatus(
				ctx, req.PusheeTxn, false, nil, q.cfg.Clock.Now(),
			)
			if pErr != nil {
				return nil, pErr
			} else if updatedPushee == nil {
				// Continue with push.
				log.VEvent(ctx, 2, "pushee not found, push should now succeed")
				return nil, nil
			}
			pusheePriority = updatedPushee.Priority
			pending.txn.Store(updatedPushee)
			if updatedPushee.Status.IsFinalized() {
				log.VEvent(ctx, 2, "push request is satisfied")
				if updatedPushee.Status == roachpb.ABORTED {
					// Inform any other waiting pushers that the transaction is now
					// finalized. Intuitively we would expect that if any pusher was
					// stuck waiting for the transaction to be finalized then it would
					// have heard about the update when the transaction record moved
					// into its finalized state. This is correct for cases where a
					// command explicitly wrote the transaction record with a finalized
					// status.
					//
					// However, this does not account for the case where a transaction
					// becomes uncommittable due to a loss of resolution in the store's
					// timestamp cache. In that case, a transaction may suddenly become
					// uncommittable without an associated write to its record. When
					// this happens, no one else will immediately inform the other
					// pushers about the uncommittable transaction. Eventually the
					// pushee's coordinator will come along and roll back its record,
					// but that's only if the pushee isn't itself waiting on the result
					// of one of the pushers here. If there is such a dependency cycle
					// then the other pushers may have to wait for up to the transaction
					// expiration to query the pushee again and notice that the pushee
					// is now uncommittable.
					q.UpdateTxn(ctx, updatedPushee)
				}
				return createPushTxnResponse(updatedPushee), nil
			}
			if IsExpired(q.cfg.Clock.Now(), updatedPushee) {
				log.VEventf(ctx, 1, "pushing expired txn %s", req.PusheeTxn.ID.Short())
				return nil, nil
			}
			// Set the timer to check for the pushee txn's expiration.
			expiration := TxnExpiration(updatedPushee).GoTime()
			now := q.cfg.Clock.Now().GoTime()
			pusheeTxnTimer.Reset(expiration.Sub(now))

		case updatedPusher := <-queryPusherCh:
			switch updatedPusher.Status {
			case roachpb.COMMITTED:
				log.VEventf(ctx, 1, "pusher committed: %v", updatedPusher)
				return nil, roachpb.NewErrorWithTxn(roachpb.NewTransactionCommittedStatusError(), updatedPusher)
			case roachpb.ABORTED:
				log.VEventf(ctx, 1, "pusher aborted: %v", updatedPusher)
				return nil, roachpb.NewErrorWithTxn(
					roachpb.NewTransactionAbortedError(roachpb.ABORT_REASON_PUSHER_ABORTED), updatedPusher)
			}
			log.VEventf(ctx, 2, "pusher was updated: %v", updatedPusher)
			if updatedPusher.Priority > pusherPriority {
				pusherPriority = updatedPusher.Priority
			}

			// Check for dependency cycle to find and break deadlocks.
			push.mu.Lock()
			_, haveDependency := push.mu.dependents[req.PusheeTxn.ID]
			dependents := make([]string, 0, len(push.mu.dependents))
			for id := range push.mu.dependents {
				dependents = append(dependents, id.Short())
			}
			log.VEventf(
				ctx,
				2,
				"%s (%d), pushing %s (%d), has dependencies=%s",
				req.PusherTxn.ID.Short(),
				pusherPriority,
				req.PusheeTxn.ID.Short(),
				pusheePriority,
				dependents,
			)
			push.mu.Unlock()

			// Since the pusher has been updated, clear any waiting queries
			// so that they continue with a query of new dependents added here.
			q.mu.Lock()
			q.releaseWaitingQueriesLocked(ctx, req.PusheeTxn.ID)
			q.mu.Unlock()

			if haveDependency {
				// Break the deadlock if the pusher has higher priority.
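				// Comparing txn IDs breaks priority ties deterministically:
				// every waiter in a cycle evaluates the same ordering, so at
				// least one of them (but never all) decides to force-abort
				// its pushee.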
				p1, p2 := pusheePriority, pusherPriority
				if p1 < p2 || (p1 == p2 && bytes.Compare(req.PusheeTxn.ID.GetBytes(), req.PusherTxn.ID.GetBytes()) < 0) {
					log.VEventf(
						ctx,
						1,
						"%s breaking deadlock by force push of %s; dependencies=%s",
						req.PusherTxn.ID.Short(),
						req.PusheeTxn.ID.Short(),
						dependents,
					)
					metrics.DeadlocksTotal.Inc(1)
					return q.forcePushAbort(ctx, req)
				}
			}
			// Signal the pusher query txn loop to continue.
			readyCh <- struct{}{}

		case pErr := <-queryPusherErrCh:
			queryPusherErrCh = nil
			return nil, pErr
		}
	}
}

// MaybeWaitForQuery checks whether there is a queue already
// established for pushing the transaction. If not, or if the QueryTxn
// request hasn't specified WaitForUpdate, return immediately. If
// there is a queue, enqueue this request as a waiter and enter a
// select loop waiting for any updates to the target transaction.
func (q *Queue) MaybeWaitForQuery(
	ctx context.Context, req *roachpb.QueryTxnRequest,
) *roachpb.Error {
	if !req.WaitForUpdate {
		return nil
	}
	metrics := q.cfg.Metrics
	q.mu.Lock()
	// If the txn wait queue is not enabled or if the request is not
	// contained within the replica, do nothing. The request can fall
	// outside of the replica after a split or merge. Note that the
	// ContainsKey check is done under the txn wait queue's lock to
	// ensure that it's not cleared before an incorrect insertion happens.
	if q.mu.txns == nil || !q.RangeContainsKeyLocked(req.Key) {
		q.mu.Unlock()
		return nil
	}

	var maxWaitCh <-chan time.Time
	// If the transaction we're waiting to query has a queue of txns
	// in turn waiting on it, and is _already_ updated from what the
	// caller is expecting, return to query the updates immediately.
	if pending, ok := q.mu.txns[req.Txn.ID]; ok && q.isTxnUpdated(pending, req) {
		q.mu.Unlock()
		return nil
	} else if !ok {
		// If the transaction we're querying has no queue established,
		// it's possible that it's no longer pending. To avoid waiting
		// forever for an update that isn't forthcoming, we set a maximum
		// time to wait for updates before allowing the query to
		// proceed.
		maxWaitCh = time.After(maxWaitForQueryTxn)
	}

	// Add a new query to wait for updates to the transaction. If a query
	// already exists, we can just increment its reference count.
	query, ok := q.mu.queries[req.Txn.ID]
	if ok {
		query.count++
	} else {
		query = &waitingQueries{
			pending: make(chan struct{}),
			count:   1,
		}
		q.mu.queries[req.Txn.ID] = query
	}
	metrics.QueryWaiting.Inc(1)
	q.mu.Unlock()

	tBegin := timeutil.Now()
	defer func() { metrics.QueryWaitTime.RecordValue(timeutil.Since(tBegin).Nanoseconds()) }()

	// When we return, make sure to unregister the query so that it doesn't
	// leak. If query.pending is closed, the query will have already been
	// cleaned up, so this will be a no-op.
	defer func() {
		q.mu.Lock()
		if query == q.mu.queries[req.Txn.ID] {
			query.count--
			metrics.QueryWaiting.Dec(1)
			if query.count == 0 {
				delete(q.mu.queries, req.Txn.ID)
			}
		}
		q.mu.Unlock()
	}()

	log.VEventf(ctx, 2, "waiting on query for %s", req.Txn.ID.Short())
	select {
	case <-ctx.Done():
		// Caller has given up.
		return roachpb.NewError(ctx.Err())
	case <-maxWaitCh:
		return nil
	case <-query.pending:
		return nil
	}
}

// startQueryPusherTxn starts a goroutine to send QueryTxn requests to
// fetch updates to the pusher's own transaction until the context is
// done or an error occurs while querying. Returns two channels: one
// for updated versions of the pusher transaction, and the other for
// errors encountered while querying. The readyCh parameter is used by
// the caller to signal when the next query to the pusher should be
// sent, and is mostly intended to avoid an extra RPC in the event that
// the QueryTxn returns sufficient information to determine a dependency
// cycle exists and must be broken.
//
// Note that the contents of the pusher transaction including updated
// priority and set of known waiting transactions (dependents) are
// accumulated over iterations and supplied with each successive
// invocation of QueryTxn in order to avoid busy querying.
func (q *Queue) startQueryPusherTxn(
	ctx context.Context, push *waitingPush, readyCh <-chan struct{},
) (<-chan *roachpb.Transaction, <-chan *roachpb.Error) {
	ch := make(chan *roachpb.Transaction, 1)
	errCh := make(chan *roachpb.Error, 1)
	push.mu.Lock()
	var waitingTxns []uuid.UUID
	if push.mu.dependents != nil {
		waitingTxns = make([]uuid.UUID, 0, len(push.mu.dependents))
		for txnID := range push.mu.dependents {
			waitingTxns = append(waitingTxns, txnID)
		}
	}
	pusher := push.req.PusherTxn.Clone()
	push.mu.Unlock()

	if err := q.cfg.Stopper.RunAsyncTask(
		ctx, "monitoring pusher txn",
		func(ctx context.Context) {
			// We use a backoff/retry here in case the pusher transaction
			// doesn't yet exist.
			for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
				var pErr *roachpb.Error
				var updatedPusher *roachpb.Transaction
				updatedPusher, waitingTxns, pErr = q.queryTxnStatus(
					ctx, pusher.TxnMeta, true, waitingTxns, q.cfg.Clock.Now(),
				)
				if pErr != nil {
					errCh <- pErr
					return
				} else if updatedPusher == nil {
					// No pusher to query; the pusher's record hasn't yet been
					// created. Continue in order to backoff and retry.
					// TODO(nvanbenschoten): we shouldn't hit this case in a 2.2
					// cluster now that QueryTxn requests synthesize
					// transactions from their provided TxnMeta. However, we
					// need to keep the logic while we want to support
					// compatibility with 2.1 nodes. Remove this in 2.3.
					log.Event(ctx, "no pusher found; backing off")
					continue
				}

				// Update the pending pusher's set of dependents. These accumulate
				// and are used to propagate the transitive set of dependencies for
				// distributed deadlock detection.
				push.mu.Lock()
				if push.mu.dependents == nil {
					push.mu.dependents = map[uuid.UUID]struct{}{}
				}
				for _, txnID := range waitingTxns {
					push.mu.dependents[txnID] = struct{}{}
				}
				push.mu.Unlock()

				// Send an update of the pusher txn.
				pusher.Update(updatedPusher)
				ch <- pusher

				// Wait for context cancellation or indication on readyCh that the
				// push waiter requires another query of the pusher txn.
				select {
				case <-ctx.Done():
					errCh <- roachpb.NewError(ctx.Err())
					return
				case <-readyCh:
				}
				// Reset the retry to query again immediately.
				r.Reset()
			}
			errCh <- roachpb.NewError(ctx.Err())
		}); err != nil {
		errCh <- roachpb.NewError(err)
	}
	return ch, errCh
}

// queryTxnStatus does a "query" push on the specified transaction
// to glean possible changes, such as a higher timestamp and/or
// priority. It turns out this is necessary while a request is waiting
// to push a transaction, as two txns can have circular dependencies
// where both are unable to push because they have different
// information about their own txns.
//
// Returns the updated transaction (or nil if not updated) as well as
// the list of transactions which are waiting on the updated txn.
func (q *Queue) queryTxnStatus(
	ctx context.Context,
	txnMeta enginepb.TxnMeta,
	wait bool,
	dependents []uuid.UUID,
	now hlc.Timestamp,
) (*roachpb.Transaction, []uuid.UUID, *roachpb.Error) {
	b := &kv.Batch{}
	b.Header.Timestamp = q.cfg.Clock.Now()
	b.AddRawRequest(&roachpb.QueryTxnRequest{
		RequestHeader: roachpb.RequestHeader{
			Key: txnMeta.Key,
		},
		Txn:              txnMeta,
		WaitForUpdate:    wait,
		KnownWaitingTxns: dependents,
	})
	if err := q.cfg.DB.Run(ctx, b); err != nil {
		// TODO(tschottdorf):
		// We shouldn't catch an error here (unless it's from the AbortSpan, in
		// which case we would not get the crucial information that we've been
		// aborted; instead we'll go around thinking we're still PENDING,
		// potentially caught in an infinite loop). Same issue: we must not use
		// RunWithResponse on this level - we're trying to do internal kv stuff
		// through the public interface. Likely not exercised in tests, so I'd be
		// ok tackling this separately.
		//
		// Scenario:
		// - we're aborted and don't know if we have a read-write conflict
		// - the push above fails and we get a WriteIntentError
		// - we try to update our transaction (right here, and if we don't we might
		//   be stuck in a race, that's why we do this - the txn proto we're using
		//   might be outdated)
		// - the query fails because our home range has the AbortSpan populated;
		//   we catch a TransactionAbortedError, but with a pending transaction
		//   (since we lose the original txn, and you just use the txn we had...)
		//
		// so something is sketchy here, but it should all resolve nicely when we
		// don't use store.db for these internal requests any more.
		return nil, nil, roachpb.NewError(err)
	}
	br := b.RawResponse()
	resp := br.Responses[0].GetInner().(*roachpb.QueryTxnResponse)
	// ID can be nil if no HeartbeatTxn has been sent yet and we're talking to a
	// 2.1 node.
	// TODO(nvanbenschoten): Remove this in 2.3.
	if updatedTxn := &resp.QueriedTxn; updatedTxn.ID != (uuid.UUID{}) {
		return updatedTxn, resp.WaitingTxns, nil
	}
	return nil, nil, nil
}

// forcePushAbort upgrades the PushTxn request to a "forced" push abort, which
// overrides the normal expiration and priority checks to ensure that it aborts
// the pushee. This mechanism can be used to break deadlocks between conflicting
// transactions.
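// The request is copied before being upgraded, so the caller's
// PushTxnRequest is not modified.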
func (q *Queue) forcePushAbort(
	ctx context.Context, req *roachpb.PushTxnRequest,
) (*roachpb.PushTxnResponse, *roachpb.Error) {
	log.VEventf(ctx, 1, "force pushing %v to break deadlock", req.PusheeTxn.ID)
	forcePush := *req
	forcePush.Force = true
	forcePush.PushType = roachpb.PUSH_ABORT
	b := &kv.Batch{}
	b.Header.Timestamp = q.cfg.Clock.Now()
	b.AddRawRequest(&forcePush)
	if err := q.cfg.DB.Run(ctx, b); err != nil {
		return nil, b.MustPErr()
	}
	return b.RawResponse().Responses[0].GetPushTxn(), nil
}

// TrackedTxns returns a (newly minted) set containing the transaction IDs which
// are being tracked (i.e. waited on).
//
// For testing purposes only.
func (q *Queue) TrackedTxns() map[uuid.UUID]struct{} {
	m := make(map[uuid.UUID]struct{})
	q.mu.Lock()
	for k := range q.mu.txns {
		m[k] = struct{}{}
	}
	q.mu.Unlock()
	return m
}
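// The sketch below is an editor's addition, not part of the original file: a
// minimal illustration of how a test might combine the exported helpers above.
// The function name and parameters are hypothetical.
func exampleExpirationCheck(clock *hlc.Clock, txn *roachpb.Transaction) bool {
	// Shrink the liveness threshold so pushees expire almost immediately;
	// the returned closure restores the previous value.
	defer TestingOverrideTxnLivenessThreshold(time.Millisecond)()
	// TxnExpiration is LastActive + TxnLivenessThreshold, so with a 1ms
	// threshold any txn that has not heartbeated within the last
	// millisecond is considered expired and can be pushed.
	return IsExpired(clock.Now(), txn)
}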