github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/txnwait/queue.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package txnwait
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"sync/atomic"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/base"
    20  	"github.com/cockroachdb/cockroach/pkg/kv"
    21  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    22  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    23  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    24  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    25  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    26  	"github.com/cockroachdb/cockroach/pkg/util/log"
    27  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    28  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    29  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    30  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    31  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    32  )
    33  
    34  const maxWaitForQueryTxn = 50 * time.Millisecond // cap on how long MaybeWaitForQuery waits when the queried txn has no pending queue
    35  
    36  // TxnLivenessHeartbeatMultiplier specifies what multiple the transaction
    37  // liveness threshold should be of the transaction heartbeat interval.
        // It is obtained from envutil.EnvOrDefaultInt for the key
        // COCKROACH_TXN_LIVENESS_HEARTBEAT_MULTIPLIER, with 5 passed as the default.
    38  var TxnLivenessHeartbeatMultiplier = envutil.EnvOrDefaultInt(
    39  	"COCKROACH_TXN_LIVENESS_HEARTBEAT_MULTIPLIER", 5)
    40  
    41  // TxnLivenessThreshold is the maximum duration between transaction heartbeats
    42  // before the transaction is considered expired by Queue. It is exposed and
    43  // mutable to allow tests to override it.
    44  //
    45  // Use TestingOverrideTxnLivenessThreshold to override the value in tests.
    46  var TxnLivenessThreshold = time.Duration(TxnLivenessHeartbeatMultiplier) * base.DefaultTxnHeartbeatInterval
    47  
    48  // TestingOverrideTxnLivenessThreshold allows tests to override the transaction
    49  // liveness threshold. The function returns a closure that should be called to
    50  // reset the value.
    51  func TestingOverrideTxnLivenessThreshold(t time.Duration) func() {
    52  	old := TxnLivenessThreshold
    53  	TxnLivenessThreshold = t
    54  	return func() {
    55  		TxnLivenessThreshold = old
    56  	}
    57  }
    58  
    59  // ShouldPushImmediately returns whether the PushTxn request should
    60  // proceed without queueing. This is true for pushes which are neither
    61  // ABORT nor TIMESTAMP, but also for ABORT and TIMESTAMP pushes where
    62  // the pushee has min priority or pusher has max priority.
    63  func ShouldPushImmediately(req *roachpb.PushTxnRequest) bool {
    64  	if req.Force {
    65  		return true
    66  	}
    67  	if !(req.PushType == roachpb.PUSH_ABORT || req.PushType == roachpb.PUSH_TIMESTAMP) {
    68  		return true
    69  	}
    70  	p1, p2 := req.PusherTxn.Priority, req.PusheeTxn.Priority
    71  	if p1 > p2 && (p1 == enginepb.MaxTxnPriority || p2 == enginepb.MinTxnPriority) {
    72  		return true
    73  	}
    74  	return false
    75  }
    76  
    77  // isPushed returns whether the PushTxn request has already been
    78  // fulfilled by the current transaction state. This may be true
    79  // for transactions with pushed timestamps.
    80  func isPushed(req *roachpb.PushTxnRequest, txn *roachpb.Transaction) bool {
    81  	return (txn.Status.IsFinalized() ||
    82  		(req.PushType == roachpb.PUSH_TIMESTAMP && req.PushTo.LessEq(txn.WriteTimestamp)))
    83  }
    84  
    85  // TxnExpiration computes the timestamp after which the transaction will be
    86  // considered expired.
    87  func TxnExpiration(txn *roachpb.Transaction) hlc.Timestamp {
    88  	return txn.LastActive().Add(TxnLivenessThreshold.Nanoseconds(), 0)
    89  }
    90  
    91  // IsExpired is true if the given transaction is expired.
    92  func IsExpired(now hlc.Timestamp, txn *roachpb.Transaction) bool {
    93  	return TxnExpiration(txn).Less(now)
    94  }
    95  
    96  // createPushTxnResponse returns a PushTxnResponse struct with a
    97  // copy of the supplied transaction. It is necessary to fully copy
    98  // each field in the transaction to avoid race conditions.
    99  func createPushTxnResponse(txn *roachpb.Transaction) *roachpb.PushTxnResponse {
   100  	return &roachpb.PushTxnResponse{PusheeTxn: *txn}
   101  }
   102  
   103  // A waitingPush represents a PushTxn command that is waiting on the
   104  // pushee transaction to commit or abort. It maintains a transitive
   105  // set of all txns which are waiting on this txn in order to detect
   106  // dependency cycles.
   107  type waitingPush struct {
   108  	req *roachpb.PushTxnRequest // the PushTxn request that is waiting
   109  	// pending receives the updated pushee txn (sent by Queue.UpdateTxn) or nil if the queue is cleared (sent by Queue.Clear).
   110  	pending chan *roachpb.Transaction
   111  	mu      struct { // mu guards dependents
   112  		syncutil.Mutex
   113  		dependents map[uuid.UUID]struct{} // transitive set of txns waiting on this txn
   114  	}
   115  }
   116  
   117  // A waitingQueries object represents one or more QueryTxn commands that are
   118  // waiting on the same target transaction to change status or acquire new
   119  // dependencies.
   120  type waitingQueries struct {
   121  	pending chan struct{} // closed to release every query waiting on this entry
   122  	count   int           // number of QueryTxn requests currently sharing this entry
   123  }
   124  
   125  // A pendingTxn represents a transaction waiting to be pushed by one
   126  // or more PushTxn requests.
   127  type pendingTxn struct {
   128  	txn           atomic.Value // the most recent txn record; holds a *roachpb.Transaction (see getTxn)
   129  	waitingPushes []*waitingPush // PushTxn requests currently queued behind this txn
   130  }
   131  
   132  func (pt *pendingTxn) getTxn() *roachpb.Transaction {
   133  	return pt.txn.Load().(*roachpb.Transaction)
   134  }
   135  
   136  func (pt *pendingTxn) getDependentsSet() map[uuid.UUID]struct{} {
   137  	set := map[uuid.UUID]struct{}{}
   138  	for _, push := range pt.waitingPushes {
   139  		if id := push.req.PusherTxn.ID; id != (uuid.UUID{}) {
   140  			set[id] = struct{}{}
   141  			push.mu.Lock()
   142  			if push.mu.dependents != nil {
   143  				for txnID := range push.mu.dependents {
   144  					set[txnID] = struct{}{}
   145  				}
   146  			}
   147  			push.mu.Unlock()
   148  		}
   149  	}
   150  	return set
   151  }
   152  
   153  // Config contains the dependencies to construct a Queue.
   154  type Config struct {
   155  	RangeDesc *roachpb.RangeDescriptor // consulted by RangeContainsKeyLocked; replaced by OnRangeDescUpdated
   156  	DB        *kv.DB // NOTE(review): not referenced in the visible part of this file; presumably used to send QueryTxn requests — confirm
   157  	Clock     *hlc.Clock // source of the current timestamp for expiration checks and txn queries
   158  	Stopper   *stop.Stopper // provides the quiesce signal and runs the pusher-monitoring async task
   159  	Metrics   *Metrics // gauges and histograms updated as pushers, pushees and queries wait
   160  	Knobs     TestingKnobs // optional test hooks
   161  }
   162  
   163  // TestingKnobs represents testing knobs for a Queue.
   164  type TestingKnobs struct {
   165  	// OnPusherBlocked is called when a would-be pusher joins a wait queue. It is invoked while the Queue's mutex is held.
   166  	OnPusherBlocked func(ctx context.Context, push *roachpb.PushTxnRequest)
   167  	// OnTxnUpdate is called by Queue.UpdateTxn. It is invoked while the Queue's mutex is held.
   168  	OnTxnUpdate func(ctx context.Context, txn *roachpb.Transaction)
   169  }
   170  
   171  // Queue enqueues PushTxn requests which are waiting on extant txns
   172  // with conflicting intents to abort or commit.
   173  //
   174  // Internally, it maintains a map from extant txn IDs to queues of pending
   175  // PushTxn requests.
   176  //
   177  // When a write intent is encountered, the command which encountered it (called
   178  // the "pusher" here) initiates a PushTxn request to determine the disposition
   179  // of the intent's transaction (called the "pushee" here). This queue is where a
   180  // PushTxn request will wait if it discovers that the pushee's transaction is
   181  // still pending, and cannot be otherwise aborted or pushed forward.
   182  //
   183  // Queue is thread safe.
   184  type Queue struct {
   185  	cfg Config
   186  	mu  struct {
   187  		syncutil.Mutex
   188  		txns    map[uuid.UUID]*pendingTxn // pending pushees keyed by txn ID; nil while the queue is disabled
   189  		queries map[uuid.UUID]*waitingQueries // waiting QueryTxn requests keyed by target txn ID; nil while disabled
   190  	}
   191  }
   192  
   193  // NewQueue instantiates a new Queue.
   194  func NewQueue(cfg Config) *Queue {
   195  	return &Queue{cfg: cfg}
   196  }
   197  
   198  // Enable allows transactions to be enqueued and waiting pushers
   199  // added. This method must be idempotent as it can be invoked multiple
   200  // times as range leases are updated for the same replica.
   201  func (q *Queue) Enable() {
   202  	q.mu.Lock()
   203  	defer q.mu.Unlock()
   204  	if q.mu.txns == nil {
   205  		q.mu.txns = map[uuid.UUID]*pendingTxn{}
   206  	}
   207  	if q.mu.queries == nil {
   208  		q.mu.queries = map[uuid.UUID]*waitingQueries{}
   209  	}
   210  }
   211  
   212  // Clear empties the queue and returns all waiters. This method should
   213  // be invoked when the replica loses or transfers its lease. If
   214  // `disable` is true, future transactions may not be enqueued or
   215  // waiting pushers added. Call Enable() once the lease is again
   216  // acquired by the replica.
   217  func (q *Queue) Clear(disable bool) {
   218  	q.mu.Lock()
   219  	var pushWaiters []chan *roachpb.Transaction
   220  	for _, pt := range q.mu.txns {
   221  		for _, w := range pt.waitingPushes {
   222  			pushWaiters = append(pushWaiters, w.pending)
   223  		}
   224  		pt.waitingPushes = nil
   225  	}
   226  
   227  	queryWaiters := q.mu.queries
   228  	queryWaitersCount := 0
   229  	for _, waitingQueries := range queryWaiters {
   230  		queryWaitersCount += waitingQueries.count
   231  	}
   232  
   233  	metrics := q.cfg.Metrics
   234  	metrics.PusheeWaiting.Dec(int64(len(q.mu.txns)))
   235  	metrics.PusherWaiting.Dec(int64(len(pushWaiters)))
   236  	metrics.QueryWaiting.Dec(int64(queryWaitersCount))
   237  
   238  	if log.V(1) {
   239  		log.Infof(
   240  			context.Background(),
   241  			"clearing %d push waiters and %d query waiters",
   242  			len(pushWaiters),
   243  			queryWaitersCount,
   244  		)
   245  	}
   246  
   247  	if disable {
   248  		q.mu.txns = nil
   249  		q.mu.queries = nil
   250  	} else {
   251  		q.mu.txns = map[uuid.UUID]*pendingTxn{}
   252  		q.mu.queries = map[uuid.UUID]*waitingQueries{}
   253  	}
   254  	q.mu.Unlock()
   255  
   256  	// Send on the pending push waiter channels outside of the mutex lock.
   257  	for _, w := range pushWaiters {
   258  		w <- nil
   259  	}
   260  	// Close query waiters outside of the mutex lock.
   261  	for _, w := range queryWaiters {
   262  		close(w.pending)
   263  	}
   264  }
   265  
   266  // IsEnabled is true if the queue is enabled.
   267  func (q *Queue) IsEnabled() bool {
   268  	q.mu.Lock()
   269  	defer q.mu.Unlock()
   270  	return q.mu.txns != nil
   271  }
   272  
   273  // OnRangeDescUpdated informs the Queue that its Range has been updated.
   274  func (q *Queue) OnRangeDescUpdated(desc *roachpb.RangeDescriptor) {
   275  	q.mu.Lock()
   276  	defer q.mu.Unlock()
   277  	q.cfg.RangeDesc = desc
   278  }
   279  
   280  // RangeContainsKeyLocked returns whether the Queue's Range contains the
   281  // specified key.
   282  func (q *Queue) RangeContainsKeyLocked(key roachpb.Key) bool {
   283  	return kvserverbase.ContainsKey(q.cfg.RangeDesc, key)
   284  }
   285  
   286  // EnqueueTxn creates a new pendingTxn for the target txn of a failed
   287  // PushTxn command. Subsequent PushTxn requests for the same txn
   288  // will be enqueued behind the pendingTxn via MaybeWait().
   289  func (q *Queue) EnqueueTxn(txn *roachpb.Transaction) {
   290  	q.mu.Lock()
   291  	defer q.mu.Unlock()
   292  	if q.mu.txns == nil {
   293  		// Not enabled; do nothing.
   294  		return
   295  	}
   296  	// If the txn which failed to push is already pending, update the
   297  	// transaction status.
   298  	if pt, ok := q.mu.txns[txn.ID]; ok {
   299  		pt.txn.Store(txn)
   300  	} else {
   301  		q.cfg.Metrics.PusheeWaiting.Inc(1)
   302  		pt = &pendingTxn{}
   303  		pt.txn.Store(txn)
   304  		q.mu.txns[txn.ID] = pt
   305  	}
   306  }
   307  
   308  // UpdateTxn is invoked to update a transaction's status after a successful
   309  // PushTxn or EndTxn command. It unblocks all pending waiters.
   310  func (q *Queue) UpdateTxn(ctx context.Context, txn *roachpb.Transaction) {
   311  	txn.AssertInitialized(ctx)
   312  	q.mu.Lock()
   313  	if f := q.cfg.Knobs.OnTxnUpdate; f != nil {
   314  		f(ctx, txn)
   315  	}
   316  
   317  	q.releaseWaitingQueriesLocked(ctx, txn.ID)
   318  
   319  	if q.mu.txns == nil {
   320  		// Not enabled; do nothing.
   321  		q.mu.Unlock()
   322  		return
   323  	}
   324  
   325  	pending, ok := q.mu.txns[txn.ID]
   326  	if !ok {
   327  		q.mu.Unlock()
   328  		return
   329  	}
   330  	waitingPushes := pending.waitingPushes
   331  	pending.waitingPushes = nil
   332  	delete(q.mu.txns, txn.ID)
   333  	pending.txn.Store(txn)
   334  	q.mu.Unlock()
   335  
   336  	metrics := q.cfg.Metrics
   337  	metrics.PusheeWaiting.Dec(1)
   338  	metrics.PusherWaiting.Dec(int64(len(waitingPushes)))
   339  
   340  	if log.V(1) && len(waitingPushes) > 0 {
   341  		log.Infof(ctx, "updating %d push waiters for %s", len(waitingPushes), txn.ID.Short())
   342  	}
   343  	// Send on pending waiter channels outside of the mutex lock.
   344  	for _, w := range waitingPushes {
   345  		w.pending <- txn
   346  	}
   347  }
   348  
   349  // GetDependents returns a slice of transactions waiting on the specified
   350  // txn either directly or indirectly.
   351  func (q *Queue) GetDependents(txnID uuid.UUID) []uuid.UUID {
   352  	q.mu.Lock()
   353  	defer q.mu.Unlock()
   354  	if q.mu.txns == nil {
   355  		// Not enabled; do nothing.
   356  		return nil
   357  	}
   358  	if pending, ok := q.mu.txns[txnID]; ok {
   359  		set := pending.getDependentsSet()
   360  		dependents := make([]uuid.UUID, 0, len(set))
   361  		for txnID := range set {
   362  			dependents = append(dependents, txnID)
   363  		}
   364  		return dependents
   365  	}
   366  	return nil
   367  }
   368  
   369  // isTxnUpdated returns whether the transaction specified in
   370  // the QueryTxnRequest has had its status or priority updated
   371  // or whether the known set of dependent transactions has
   372  // changed.
   373  func (q *Queue) isTxnUpdated(pending *pendingTxn, req *roachpb.QueryTxnRequest) bool {
   374  	// First check whether txn status or priority has changed.
   375  	txn := pending.getTxn()
   376  	if txn.Status.IsFinalized() || txn.Priority > req.Txn.Priority {
   377  		return true
   378  	}
   379  	// Next, see if there is any discrepancy in the set of known dependents.
   380  	set := pending.getDependentsSet()
   381  	if len(req.KnownWaitingTxns) != len(set) {
   382  		return true
   383  	}
   384  	for _, txnID := range req.KnownWaitingTxns {
   385  		if _, ok := set[txnID]; !ok {
   386  			return true
   387  		}
   388  	}
   389  	return false
   390  }
   391  
   392  func (q *Queue) releaseWaitingQueriesLocked(ctx context.Context, txnID uuid.UUID) {
   393  	if w, ok := q.mu.queries[txnID]; ok {
   394  		metrics := q.cfg.Metrics
   395  		metrics.QueryWaiting.Dec(int64(w.count))
   396  		log.VEventf(ctx, 2, "releasing %d waiting queries for %s", w.count, txnID.Short())
   397  		close(w.pending)
   398  		delete(q.mu.queries, txnID)
   399  	}
   400  }
   401  
   402  // MaybeWaitForPush checks whether there is a queue already
   403  // established for pushing the transaction. If not, or if the PushTxn
   404  // request isn't queueable, return immediately. If there is a queue,
   405  // enqueue this request as a waiter and enter a select loop waiting
   406  // for resolution.
   407  //
   408  // If the transaction is successfully pushed while this method is waiting,
   409  // the first return value is a non-nil PushTxnResponse object.
        //
        // A (nil, nil) return means the push was not resolved here and the
        // request should proceed. A non-nil error is returned when the context
        // is done, when querying the pushee or the pusher fails, or when the
        // pusher itself is found to be COMMITTED or ABORTED. When a dependency
        // cycle is detected and this pusher wins, the result of forcePushAbort
        // is returned.
   410  func (q *Queue) MaybeWaitForPush(
   411  	ctx context.Context, req *roachpb.PushTxnRequest,
   412  ) (*roachpb.PushTxnResponse, *roachpb.Error) {
   413  	if ShouldPushImmediately(req) {
   414  		return nil, nil
   415  	}
   416  
   417  	q.mu.Lock()
   418  	// If the txn wait queue is not enabled or if the request is not
   419  	// contained within the replica, do nothing. The request can fall
   420  	// outside of the replica after a split or merge. Note that the
   421  	// ContainsKey check is done under the txn wait queue's lock to
   422  	// ensure that it's not cleared before an incorrect insertion happens.
   423  	if q.mu.txns == nil || !q.RangeContainsKeyLocked(req.Key) {
   424  		q.mu.Unlock()
   425  		return nil, nil
   426  	}
   427  
   428  	// If there's no pending queue for this txn, return not pushed. If
   429  	// already pushed, return push success.
   430  	pending, ok := q.mu.txns[req.PusheeTxn.ID]
   431  	if !ok {
   432  		q.mu.Unlock()
   433  		return nil, nil
   434  	}
   435  	if txn := pending.getTxn(); isPushed(req, txn) {
   436  		q.mu.Unlock()
   437  		return createPushTxnResponse(txn), nil
   438  	}
   439  
        	// The pending channel has capacity 1 so that UpdateTxn and Clear can
        	// deliver their single notification without blocking on this waiter.
   440  	push := &waitingPush{
   441  		req:     req,
   442  		pending: make(chan *roachpb.Transaction, 1),
   443  	}
   444  	pending.waitingPushes = append(pending.waitingPushes, push)
   445  	if f := q.cfg.Knobs.OnPusherBlocked; f != nil {
   446  		f(ctx, req)
   447  	}
   448  	// Because we're adding another dependent on the pending
   449  	// transaction, send on the waiting queries' channel to
   450  	// indicate there is a new dependent and they should proceed
   451  	// to execute the QueryTxn command.
   452  	q.releaseWaitingQueriesLocked(ctx, req.PusheeTxn.ID)
   453  
   454  	if req.PusherTxn.ID != (uuid.UUID{}) {
   455  		log.VEventf(
   456  			ctx,
   457  			2,
   458  			"%s pushing %s (%d pending)",
   459  			req.PusherTxn.ID.Short(),
   460  			req.PusheeTxn.ID.Short(),
   461  			len(pending.waitingPushes),
   462  		)
   463  	} else {
   464  		log.VEventf(ctx, 2, "pushing %s (%d pending)", req.PusheeTxn.ID.Short(), len(pending.waitingPushes))
   465  	}
   466  	q.mu.Unlock()
   467  
   468  	// Wait for any updates to the pusher txn to be notified when
   469  	// status, priority, or dependents (for deadlock detection) have
   470  	// changed.
        	// These stay nil for non-transactional or non-locking pushers; receiving
        	// from a nil channel blocks forever, so the corresponding select cases
        	// below are simply never taken in that situation.
   471  	var queryPusherCh <-chan *roachpb.Transaction // accepts updates to the pusher txn
   472  	var queryPusherErrCh <-chan *roachpb.Error    // accepts errors querying the pusher txn
   473  	var readyCh chan struct{}                     // signaled when pusher txn should be queried
   474  
   475  	// Query the pusher if it's a valid read-write transaction.
   476  	if req.PusherTxn.ID != uuid.Nil && req.PusherTxn.IsLocking() {
   477  		// Create a context which will be canceled once this call completes.
   478  		// This ensures that the goroutine created to query the pusher txn
   479  		// is properly cleaned up.
   480  		var cancel func()
   481  		ctx, cancel = context.WithCancel(ctx)
   482  		readyCh = make(chan struct{}, 1)
   483  		queryPusherCh, queryPusherErrCh = q.startQueryPusherTxn(ctx, push, readyCh)
   484  		// Ensure that the pusher querying goroutine is complete at exit.
        		// queryPusherErrCh is set to nil once an error has already been
        		// received from it in the loop below, in which case there is nothing
        		// left to wait for.
   485  		defer func() {
   486  			cancel()
   487  			if queryPusherErrCh != nil {
   488  				<-queryPusherErrCh
   489  			}
   490  		}()
   491  	}
   492  	pusherPriority := req.PusherTxn.Priority
   493  	pusheePriority := req.PusheeTxn.Priority
   494  
   495  	metrics := q.cfg.Metrics
   496  	metrics.PusherWaiting.Inc(1)
   497  	tBegin := timeutil.Now()
   498  	defer func() { metrics.PusherWaitTime.RecordValue(timeutil.Since(tBegin).Nanoseconds()) }()
   499  
   500  	slowTimerThreshold := time.Minute
   501  	slowTimer := timeutil.NewTimer()
   502  	defer slowTimer.Stop()
   503  	slowTimer.Reset(slowTimerThreshold)
   504  
   505  	var pusheeTxnTimer timeutil.Timer
   506  	defer pusheeTxnTimer.Stop()
   507  	// The first time we want to check the pushee's txn record immediately:
   508  	// the pushee might be gone by the time the pusher gets here if it cleaned
   509  	// itself up after the pusher saw an intent but before it entered this
   510  	// queue.
   511  	pusheeTxnTimer.Reset(0)
   512  	for {
   513  		select {
   514  		case <-slowTimer.C:
   515  			slowTimer.Read = true
   516  			metrics.PusherSlow.Inc(1)
   517  			log.Warningf(ctx, "pusher %s: have been waiting %.2fs for pushee %s",
   518  				req.PusherTxn.ID.Short(),
   519  				timeutil.Since(tBegin).Seconds(),
   520  				req.PusheeTxn.ID.Short(),
   521  			)
        			// This defer runs when the function returns, not at the end of the
        			// loop iteration. slowTimer is never reset after firing, so it is
        			// registered at most once per call.
   522  			defer func() {
   523  				metrics.PusherSlow.Dec(1)
   524  				log.Warningf(ctx, "pusher %s: finished waiting after %.2fs for pushee %s",
   525  					req.PusherTxn.ID.Short(),
   526  					timeutil.Since(tBegin).Seconds(),
   527  					req.PusheeTxn.ID.Short(),
   528  				)
   529  			}()
   530  		case <-ctx.Done():
   531  			// Caller has given up.
   532  			log.VEvent(ctx, 2, "pusher giving up due to context cancellation")
   533  			return nil, roachpb.NewError(ctx.Err())
   534  		case <-q.cfg.Stopper.ShouldQuiesce():
   535  			// Let the push out so that they can be sent looking elsewhere.
   536  			return nil, nil
   537  		case txn := <-push.pending:
   538  			log.VEventf(ctx, 2, "result of pending push: %v", txn)
   539  			// If txn is nil, the queue was cleared, presumably because the
   540  			// replica lost the range lease. Return not pushed so request
   541  			// proceeds and is redirected to the new range lease holder.
   542  			if txn == nil {
   543  				return nil, nil
   544  			}
   545  			// Transaction was committed, aborted or had its timestamp
   546  			// pushed. If this PushTxn request is satisfied, return
   547  			// successful PushTxn response.
   548  			if isPushed(req, txn) {
   549  				log.VEvent(ctx, 2, "push request is satisfied")
   550  				return createPushTxnResponse(txn), nil
   551  			}
   552  			// If not successfully pushed, return not pushed so request proceeds.
   553  			log.VEvent(ctx, 2, "not pushed; returning to caller")
   554  			return nil, nil
   555  
   556  		case <-pusheeTxnTimer.C:
   557  			log.VEvent(ctx, 2, "querying pushee")
   558  			pusheeTxnTimer.Read = true
   559  			// Periodically check whether the pushee txn has been abandoned.
   560  			updatedPushee, _, pErr := q.queryTxnStatus(
   561  				ctx, req.PusheeTxn, false, nil, q.cfg.Clock.Now(),
   562  			)
   563  			if pErr != nil {
   564  				return nil, pErr
   565  			} else if updatedPushee == nil {
   566  				// Continue with push.
   567  				log.VEvent(ctx, 2, "pushee not found, push should now succeed")
   568  				return nil, nil
   569  			}
   570  			pusheePriority = updatedPushee.Priority
   571  			pending.txn.Store(updatedPushee)
   572  			if updatedPushee.Status.IsFinalized() {
   573  				log.VEvent(ctx, 2, "push request is satisfied")
   574  				if updatedPushee.Status == roachpb.ABORTED {
   575  					// Inform any other waiting pushers that the transaction is now
   576  					// finalized. Intuitively we would expect that if any pusher was
   577  					// stuck waiting for the transaction to be finalized then it would
   578  					// have heard about the update when the transaction record moved
   579  					// into its finalized state. This is correct for cases where a
   580  					// command explicitly wrote the transaction record with a finalized
   581  					// status.
   582  					//
   583  					// However, this does not account for the case where a transaction
   584  					// becomes uncommittable due a loss of resolution in the store's
   585  					// timestamp cache. In that case, a transaction may suddenly become
   586  					// uncommittable without an associated write to its record. When
   587  					// this happens, no one else will immediately inform the other
   588  					// pushers about the uncommittable transaction. Eventually the
   589  					// pushee's coordinator will come along and roll back its record,
   590  					// but that's only if the pushee isn't itself waiting on the result
   591  					// of one of the pushers here. If there is such a dependency cycle
   592  					// then the other pushers may have to wait for up to the transaction
   593  					// expiration to query the pushee again and notice that the pushee
   594  					// is now uncommittable.
   595  					q.UpdateTxn(ctx, updatedPushee)
   596  				}
   597  				return createPushTxnResponse(updatedPushee), nil
   598  			}
   599  			if IsExpired(q.cfg.Clock.Now(), updatedPushee) {
   600  				log.VEventf(ctx, 1, "pushing expired txn %s", req.PusheeTxn.ID.Short())
   601  				return nil, nil
   602  			}
   603  			// Set the timer to check for the pushee txn's expiration.
   604  			expiration := TxnExpiration(updatedPushee).GoTime()
   605  			now := q.cfg.Clock.Now().GoTime()
   606  			pusheeTxnTimer.Reset(expiration.Sub(now))
   607  
   608  		case updatedPusher := <-queryPusherCh:
   609  			switch updatedPusher.Status {
   610  			case roachpb.COMMITTED:
   611  				log.VEventf(ctx, 1, "pusher committed: %v", updatedPusher)
   612  				return nil, roachpb.NewErrorWithTxn(roachpb.NewTransactionCommittedStatusError(), updatedPusher)
   613  			case roachpb.ABORTED:
   614  				log.VEventf(ctx, 1, "pusher aborted: %v", updatedPusher)
   615  				return nil, roachpb.NewErrorWithTxn(
   616  					roachpb.NewTransactionAbortedError(roachpb.ABORT_REASON_PUSHER_ABORTED), updatedPusher)
   617  			}
   618  			log.VEventf(ctx, 2, "pusher was updated: %v", updatedPusher)
        			// The tracked pusher priority only ever increases.
   619  			if updatedPusher.Priority > pusherPriority {
   620  				pusherPriority = updatedPusher.Priority
   621  			}
   622  
   623  			// Check for dependency cycle to find and break deadlocks.
        			// NOTE(review): push.mu.dependents is not written in the visible
        			// portion of this file; presumably the pusher-query goroutine
        			// started above populates it — confirm.
   624  			push.mu.Lock()
   625  			_, haveDependency := push.mu.dependents[req.PusheeTxn.ID]
   626  			dependents := make([]string, 0, len(push.mu.dependents))
   627  			for id := range push.mu.dependents {
   628  				dependents = append(dependents, id.Short())
   629  			}
   630  			log.VEventf(
   631  				ctx,
   632  				2,
   633  				"%s (%d), pushing %s (%d), has dependencies=%s",
   634  				req.PusherTxn.ID.Short(),
   635  				pusherPriority,
   636  				req.PusheeTxn.ID.Short(),
   637  				pusheePriority,
   638  				dependents,
   639  			)
   640  			push.mu.Unlock()
   641  
   642  			// Since the pusher has been updated, clear any waiting queries
   643  			// so that they continue with a query of new dependents added here.
   644  			q.mu.Lock()
   645  			q.releaseWaitingQueriesLocked(ctx, req.PusheeTxn.ID)
   646  			q.mu.Unlock()
   647  
   648  			if haveDependency {
   649  				// Break the deadlock if the pusher has higher priority.
        				// Equal priorities are tie-broken on the raw txn ID bytes: the
        				// pusher wins when the pushee's ID compares lower than its own.
   650  				p1, p2 := pusheePriority, pusherPriority
   651  				if p1 < p2 || (p1 == p2 && bytes.Compare(req.PusheeTxn.ID.GetBytes(), req.PusherTxn.ID.GetBytes()) < 0) {
   652  					log.VEventf(
   653  						ctx,
   654  						1,
   655  						"%s breaking deadlock by force push of %s; dependencies=%s",
   656  						req.PusherTxn.ID.Short(),
   657  						req.PusheeTxn.ID.Short(),
   658  						dependents,
   659  					)
   660  					metrics.DeadlocksTotal.Inc(1)
   661  					return q.forcePushAbort(ctx, req)
   662  				}
   663  			}
   664  			// Signal the pusher query txn loop to continue.
   665  			readyCh <- struct{}{}
   666  
   667  		case pErr := <-queryPusherErrCh:
        			// Clear the channel so the deferred cleanup above does not try to
        			// receive from it a second time.
   668  			queryPusherErrCh = nil
   669  			return nil, pErr
   670  		}
   671  	}
   672  }
   673  
   674  // MaybeWaitForQuery checks whether there is a queue already
   675  // established for pushing the transaction. If not, or if the QueryTxn
   676  // request hasn't specified WaitForUpdate, return immediately. If
   677  // there is a queue, enqueue this request as a waiter and enter a
   678  // select loop waiting for any updates to the target transaction.
   679  func (q *Queue) MaybeWaitForQuery(
   680  	ctx context.Context, req *roachpb.QueryTxnRequest,
   681  ) *roachpb.Error {
   682  	if !req.WaitForUpdate {
   683  		return nil
   684  	}
   685  	metrics := q.cfg.Metrics
   686  	q.mu.Lock()
   687  	// If the txn wait queue is not enabled or if the request is not
   688  	// contained within the replica, do nothing. The request can fall
   689  	// outside of the replica after a split or merge. Note that the
   690  	// ContainsKey check is done under the txn wait queue's lock to
   691  	// ensure that it's not cleared before an incorrect insertion happens.
   692  	if q.mu.txns == nil || !q.RangeContainsKeyLocked(req.Key) {
   693  		q.mu.Unlock()
   694  		return nil
   695  	}
   696  
   697  	var maxWaitCh <-chan time.Time
   698  	// If the transaction we're waiting to query has a queue of txns
   699  	// in turn waiting on it, and is _already_ updated from what the
   700  	// caller is expecting, return to query the updates immediately.
   701  	if pending, ok := q.mu.txns[req.Txn.ID]; ok && q.isTxnUpdated(pending, req) {
   702  		q.mu.Unlock()
   703  		return nil
   704  	} else if !ok {
   705  		// If the transaction we're querying has no queue established,
   706  		// it's possible that it's no longer pending. To avoid waiting
   707  		// forever for an update that isn't forthcoming, we set a maximum
   708  		// time to wait for updates before allowing the query to
   709  		// proceed.
   710  		maxWaitCh = time.After(maxWaitForQueryTxn)
   711  	}
   712  
   713  	// Add a new query to wait for updates to the transaction. If a query
   714  	// already exists, we can just increment its reference count.
   715  	query, ok := q.mu.queries[req.Txn.ID]
   716  	if ok {
   717  		query.count++
   718  	} else {
   719  		query = &waitingQueries{
   720  			pending: make(chan struct{}),
   721  			count:   1,
   722  		}
   723  		q.mu.queries[req.Txn.ID] = query
   724  	}
   725  	metrics.QueryWaiting.Inc(1)
   726  	q.mu.Unlock()
   727  
   728  	tBegin := timeutil.Now()
   729  	defer func() { metrics.QueryWaitTime.RecordValue(timeutil.Since(tBegin).Nanoseconds()) }()
   730  
   731  	// When we return, make sure to unregister the query so that it doesn't
   732  	// leak. If query.pending if closed, the query will have already been
   733  	// cleaned up, so this will be a no-op.
   734  	defer func() {
   735  		q.mu.Lock()
   736  		if query == q.mu.queries[req.Txn.ID] {
   737  			query.count--
   738  			metrics.QueryWaiting.Dec(1)
   739  			if query.count == 0 {
   740  				delete(q.mu.queries, req.Txn.ID)
   741  			}
   742  		}
   743  		q.mu.Unlock()
   744  	}()
   745  
   746  	log.VEventf(ctx, 2, "waiting on query for %s", req.Txn.ID.Short())
   747  	select {
   748  	case <-ctx.Done():
   749  		// Caller has given up.
   750  		return roachpb.NewError(ctx.Err())
   751  	case <-maxWaitCh:
   752  		return nil
   753  	case <-query.pending:
   754  		return nil
   755  	}
   756  }
   757  
   758  // startQueryPusherTxn starts a goroutine to send QueryTxn requests to
   759  // fetch updates to the pusher's own transaction until the context is
   760  // done or an error occurs while querying. Returns two channels: one
   761  // for updated versions of the pusher transaction, and the other for
   762  // errors encountered while querying. The readyCh parameter is used by
   763  // the caller to signal when the next query to the pusher should be
   764  // sent, and is mostly intended to avoid an extra RPC in the event that
   765  // the QueryTxn returns sufficient information to determine a dependency
   766  // cycle exists and must be broken.
   767  //
   768  // Note that the contents of the pusher transaction including updated
   769  // priority and set of known waiting transactions (dependents) are
   770  // accumulated over iterations and supplied with each successive
   771  // invocation of QueryTxn in order to avoid busy querying.
   772  func (q *Queue) startQueryPusherTxn(
   773  	ctx context.Context, push *waitingPush, readyCh <-chan struct{},
   774  ) (<-chan *roachpb.Transaction, <-chan *roachpb.Error) {
   775  	ch := make(chan *roachpb.Transaction, 1)
   776  	errCh := make(chan *roachpb.Error, 1)
   777  	push.mu.Lock()
   778  	var waitingTxns []uuid.UUID
   779  	if push.mu.dependents != nil {
   780  		waitingTxns = make([]uuid.UUID, 0, len(push.mu.dependents))
   781  		for txnID := range push.mu.dependents {
   782  			waitingTxns = append(waitingTxns, txnID)
   783  		}
   784  	}
   785  	pusher := push.req.PusherTxn.Clone()
   786  	push.mu.Unlock()
   787  
   788  	if err := q.cfg.Stopper.RunAsyncTask(
   789  		ctx, "monitoring pusher txn",
   790  		func(ctx context.Context) {
   791  			// We use a backoff/retry here in case the pusher transaction
   792  			// doesn't yet exist.
   793  			for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
   794  				var pErr *roachpb.Error
   795  				var updatedPusher *roachpb.Transaction
   796  				updatedPusher, waitingTxns, pErr = q.queryTxnStatus(
   797  					ctx, pusher.TxnMeta, true, waitingTxns, q.cfg.Clock.Now(),
   798  				)
   799  				if pErr != nil {
   800  					errCh <- pErr
   801  					return
   802  				} else if updatedPusher == nil {
   803  					// No pusher to query; the pusher's record hasn't yet been
   804  					// created. Continue in order to backoff and retry.
   805  					// TODO(nvanbenschoten): we shouldn't hit this case in a 2.2
   806  					// cluster now that QueryTxn requests synthesize
   807  					// transactions from their provided TxnMeta. However, we
   808  					// need to keep the logic while we want to support
   809  					// compatibility with 2.1 nodes. Remove this in 2.3.
   810  					log.Event(ctx, "no pusher found; backing off")
   811  					continue
   812  				}
   813  
   814  				// Update the pending pusher's set of dependents. These accumulate
   815  				// and are used to propagate the transitive set of dependencies for
   816  				// distributed deadlock detection.
   817  				push.mu.Lock()
   818  				if push.mu.dependents == nil {
   819  					push.mu.dependents = map[uuid.UUID]struct{}{}
   820  				}
   821  				for _, txnID := range waitingTxns {
   822  					push.mu.dependents[txnID] = struct{}{}
   823  				}
   824  				push.mu.Unlock()
   825  
   826  				// Send an update of the pusher txn.
   827  				pusher.Update(updatedPusher)
   828  				ch <- pusher
   829  
   830  				// Wait for context cancellation or indication on readyCh that the
   831  				// push waiter requires another query of the pusher txn.
   832  				select {
   833  				case <-ctx.Done():
   834  					errCh <- roachpb.NewError(ctx.Err())
   835  					return
   836  				case <-readyCh:
   837  				}
   838  				// Reset the retry to query again immediately.
   839  				r.Reset()
   840  			}
   841  			errCh <- roachpb.NewError(ctx.Err())
   842  		}); err != nil {
   843  		errCh <- roachpb.NewError(err)
   844  	}
   845  	return ch, errCh
   846  }
   847  
   848  // queryTxnStatus does a "query" push on the specified transaction
   849  // to glean possible changes, such as a higher timestamp and/or
   850  // priority. It turns out this is necessary while a request is waiting
   851  // to push a transaction, as two txns can have circular dependencies
   852  // where both are unable to push because they have different
   853  // information about their own txns.
   854  //
   855  // Returns the updated transaction (or nil if not updated) as well as
   856  // the list of transactions which are waiting on the updated txn.
   857  func (q *Queue) queryTxnStatus(
   858  	ctx context.Context,
   859  	txnMeta enginepb.TxnMeta,
   860  	wait bool,
   861  	dependents []uuid.UUID,
   862  	now hlc.Timestamp,
   863  ) (*roachpb.Transaction, []uuid.UUID, *roachpb.Error) {
   864  	b := &kv.Batch{}
   865  	b.Header.Timestamp = q.cfg.Clock.Now()
   866  	b.AddRawRequest(&roachpb.QueryTxnRequest{
   867  		RequestHeader: roachpb.RequestHeader{
   868  			Key: txnMeta.Key,
   869  		},
   870  		Txn:              txnMeta,
   871  		WaitForUpdate:    wait,
   872  		KnownWaitingTxns: dependents,
   873  	})
   874  	if err := q.cfg.DB.Run(ctx, b); err != nil {
   875  		// TODO(tschottdorf):
   876  		// We shouldn't catch an error here (unless it's from the AbortSpan, in
   877  		// which case we would not get the crucial information that we've been
   878  		// aborted; instead we'll go around thinking we're still PENDING,
   879  		// potentially caught in an infinite loop).  Same issue: we must not use
   880  		// RunWithResponse on this level - we're trying to do internal kv stuff
   881  		// through the public interface. Likely not exercised in tests, so I'd be
   882  		// ok tackling this separately.
   883  		//
   884  		// Scenario:
   885  		// - we're aborted and don't know if we have a read-write conflict
   886  		// - the push above fails and we get a WriteIntentError
   887  		// - we try to update our transaction (right here, and if we don't we might
   888  		// be stuck in a race, that's why we do this - the txn proto we're using
   889  		// might be outdated)
   890  		// - query fails because our home range has the AbortSpan populated we catch
   891  		// a TransactionAbortedError, but with a pending transaction (since we lose
   892  		// the original txn, and you just use the txn we had...)
   893  		//
   894  		// so something is sketchy here, but it should all resolve nicely when we
   895  		// don't use store.db for these internal requests any more.
   896  		return nil, nil, roachpb.NewError(err)
   897  	}
   898  	br := b.RawResponse()
   899  	resp := br.Responses[0].GetInner().(*roachpb.QueryTxnResponse)
   900  	// ID can be nil if no HeartbeatTxn has been sent yet and we're talking to a
   901  	// 2.1 node.
   902  	// TODO(nvanbenschoten): Remove this in 2.3.
   903  	if updatedTxn := &resp.QueriedTxn; updatedTxn.ID != (uuid.UUID{}) {
   904  		return updatedTxn, resp.WaitingTxns, nil
   905  	}
   906  	return nil, nil, nil
   907  }
   908  
   909  // forcePushAbort upgrades the PushTxn request to a "forced" push abort, which
   910  // overrides the normal expiration and priority checks to ensure that it aborts
   911  // the pushee. This mechanism can be used to break deadlocks between conflicting
   912  // transactions.
   913  func (q *Queue) forcePushAbort(
   914  	ctx context.Context, req *roachpb.PushTxnRequest,
   915  ) (*roachpb.PushTxnResponse, *roachpb.Error) {
   916  	log.VEventf(ctx, 1, "force pushing %v to break deadlock", req.PusheeTxn.ID)
   917  	forcePush := *req
   918  	forcePush.Force = true
   919  	forcePush.PushType = roachpb.PUSH_ABORT
   920  	b := &kv.Batch{}
   921  	b.Header.Timestamp = q.cfg.Clock.Now()
   922  	b.AddRawRequest(&forcePush)
   923  	if err := q.cfg.DB.Run(ctx, b); err != nil {
   924  		return nil, b.MustPErr()
   925  	}
   926  	return b.RawResponse().Responses[0].GetPushTxn(), nil
   927  }
   928  
   929  // TrackedTxns returns a (newly minted) set containing the transaction IDs which
   930  // are being tracked (i.e. waited on).
   931  //
   932  // For testing purposes only.
   933  func (q *Queue) TrackedTxns() map[uuid.UUID]struct{} {
   934  	m := make(map[uuid.UUID]struct{})
   935  	q.mu.Lock()
   936  	for k := range q.mu.txns {
   937  		m[k] = struct{}{}
   938  	}
   939  	q.mu.Unlock()
   940  	return m
   941  }