github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/txn_wait_queue_test.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"reflect"
    18  	"regexp"
    19  	"sync/atomic"
    20  	"testing"
    21  	"time"
    22  
    23  	"github.com/cockroachdb/cockroach/pkg/keys"
    24  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    25  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait"
    26  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    27  	"github.com/cockroachdb/cockroach/pkg/storage"
    28  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    29  	"github.com/cockroachdb/cockroach/pkg/testutils"
    30  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    31  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    32  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    33  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    34  	"github.com/cockroachdb/errors"
    35  	"github.com/stretchr/testify/assert"
    36  	"github.com/stretchr/testify/require"
    37  )
    38  
    39  func writeTxnRecord(ctx context.Context, tc *testContext, txn *roachpb.Transaction) error {
    40  	key := keys.TransactionKey(txn.Key, txn.ID)
    41  	return storage.MVCCPutProto(ctx, tc.store.Engine(), nil, key, hlc.Timestamp{}, nil, txn)
    42  }
    43  
    44  // createTxnForPushQueue creates a txn struct and writes a "fake"
    45  // transaction record for it to the underlying engine.
    46  func createTxnForPushQueue(ctx context.Context, tc *testContext) (*roachpb.Transaction, error) {
    47  	txn := newTransaction("txn", roachpb.Key("a"), 1, tc.Clock())
    48  	return txn, writeTxnRecord(ctx, tc, txn)
    49  }
    50  
// RespWithErr bundles the two values returned by a MaybeWaitForPush call so
// that a waiter goroutine can hand both back to the test over a channel.
type RespWithErr struct {
	// resp is the response returned by MaybeWaitForPush; may be nil.
	resp *roachpb.PushTxnResponse
	// pErr is the error returned by MaybeWaitForPush; may be nil.
	pErr *roachpb.Error
}
    55  
    56  func checkAllGaugesZero(tc testContext) error {
    57  	m := tc.store.txnWaitMetrics
    58  	if act := m.PusheeWaiting.Value(); act != 0 {
    59  		return errors.Errorf("expected PusheeWaiting to be 0, got %d instead", act)
    60  	}
    61  	if act := m.PusherWaiting.Value(); act != 0 {
    62  		return errors.Errorf("expected PusherWaiting to be 0, got %d instead", act)
    63  	}
    64  	if act := m.QueryWaiting.Value(); act != 0 {
    65  		return errors.Errorf("expected QueryWaiting to be 0, got %d instead", act)
    66  	}
    67  	if act := m.PusherSlow.Value(); act != 0 {
    68  		return errors.Errorf("expected PusherSlow to be 0, got %d instead", act)
    69  	}
    70  	return nil
    71  }
    72  
// TestTxnWaitQueueEnableDisable exercises the txn wait queue across a
// disable: it enqueues a pushee, parks a pusher on it, clears the queue with
// disable=true, and then checks that the waiter is released with a nil
// response and nil error, that the waiting gauges read zero, and that
// subsequent EnqueueTxn, UpdateTxn and MaybeWaitForPush calls have no
// visible effect.
func TestTxnWaitQueueEnableDisable(t *testing.T) {
	defer leaktest.AfterTest(t)()
	tc := testContext{}
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	tc.Start(t, stopper)

	txn, err := createTxnForPushQueue(context.Background(), &tc)
	if err != nil {
		t.Fatal(err)
	}

	// Queue starts enabled.
	q := tc.repl.concMgr.TxnWaitQueue()
	if !q.IsEnabled() {
		t.Errorf("expected push txn queue is enabled")
	}
	if err := checkAllGaugesZero(tc); err != nil {
		t.Fatal(err.Error())
	}

	// Enqueueing the pushee must make it visible in the tracked txns map and
	// bump the PusheeWaiting gauge to 1.
	q.EnqueueTxn(txn)
	if _, ok := q.TrackedTxns()[txn.ID]; !ok {
		t.Fatalf("expected pendingTxn to be in txns map after enqueue")
	}
	m := tc.store.txnWaitMetrics
	assert.EqualValues(tc, 1, m.PusheeWaiting.Value())

	pusher := newTransaction("pusher", roachpb.Key("a"), 1, tc.Clock())
	req := roachpb.PushTxnRequest{
		PushType:  roachpb.PUSH_ABORT,
		PusherTxn: *pusher,
		PusheeTxn: txn.TxnMeta,
	}

	// Park the pusher on the pushee in a separate goroutine; the result is
	// delivered through the buffered channel so the goroutine never blocks on
	// the send.
	retCh := make(chan RespWithErr, 1)
	go func() {
		resp, pErr := q.MaybeWaitForPush(context.Background(), &req)
		retCh <- RespWithErr{resp, pErr}
	}()

	// Poll until the pusher is registered as a dependent of the pushee and
	// both waiting gauges read 1.
	testutils.SucceedsSoon(t, func() error {
		expDeps := []uuid.UUID{pusher.ID}
		if deps := q.GetDependents(txn.ID); !reflect.DeepEqual(deps, expDeps) {
			return errors.Errorf("expected GetDependents %+v; got %+v", expDeps, deps)
		}
		if act, exp := m.PusherWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d pushers, but want %d", act, exp)
		}
		if act, exp := m.PusheeWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d pushees, but want %d", act, exp)
		}

		return nil
	})

	// Now disable the queue and make sure the waiter is returned.
	q.Clear(true /* disable */)
	if q.IsEnabled() {
		t.Errorf("expected queue to be disabled")
	}
	if err := checkAllGaugesZero(tc); err != nil {
		t.Fatal(err.Error())
	}

	// The parked waiter must come back with neither a response nor an error.
	respWithErr := <-retCh
	if respWithErr.resp != nil {
		t.Errorf("expected nil response; got %+v", respWithErr.resp)
	}
	if respWithErr.pErr != nil {
		t.Errorf("expected nil err; got %+v", respWithErr.pErr)
	}

	if deps := q.GetDependents(txn.ID); deps != nil {
		t.Errorf("expected GetDependents to return nil as queue is disabled; got %+v", deps)
	}

	// NOTE(review): this check only asserts that EnqueueTxn did not re-enable
	// the queue. That nothing was actually tracked is covered indirectly by
	// the gauge check right after it and by the TrackedTxns length check
	// further below.
	q.EnqueueTxn(txn)
	if q.IsEnabled() {
		t.Errorf("expected enqueue to silently fail since queue is disabled")
	}
	if err := checkAllGaugesZero(tc); err != nil {
		t.Fatal(err.Error())
	}

	q.UpdateTxn(context.Background(), txn)
	if len(q.TrackedTxns()) != 0 {
		t.Fatalf("expected update to silently fail since queue is disabled")
	}

	// With the queue disabled, waiting for a push must return immediately
	// with a nil response and nil error (this call is made synchronously).
	if resp, pErr := q.MaybeWaitForPush(context.Background(), &req); resp != nil || pErr != nil {
		t.Errorf("expected nil resp and err as queue is disabled; got %+v, %s", resp, pErr)
	}
	if err := checkAllGaugesZero(tc); err != nil {
		t.Fatal(err.Error())
	}
}
   170  
   171  func TestTxnWaitQueueCancel(t *testing.T) {
   172  	defer leaktest.AfterTest(t)()
   173  	tc := testContext{}
   174  	stopper := stop.NewStopper()
   175  	defer stopper.Stop(context.Background())
   176  	tc.Start(t, stopper)
   177  
   178  	txn, err := createTxnForPushQueue(context.Background(), &tc)
   179  	if err != nil {
   180  		t.Fatal(err)
   181  	}
   182  	pusher := newTransaction("pusher", roachpb.Key("a"), 1, tc.Clock())
   183  	req := roachpb.PushTxnRequest{
   184  		PushType:  roachpb.PUSH_ABORT,
   185  		PusherTxn: *pusher,
   186  		PusheeTxn: txn.TxnMeta,
   187  	}
   188  
   189  	q := tc.repl.concMgr.TxnWaitQueue()
   190  	q.Enable()
   191  	if err := checkAllGaugesZero(tc); err != nil {
   192  		t.Fatal(err.Error())
   193  	}
   194  	q.EnqueueTxn(txn)
   195  	m := tc.store.txnWaitMetrics
   196  	assert.EqualValues(tc, 1, m.PusheeWaiting.Value())
   197  	assert.EqualValues(tc, 0, m.PusherWaiting.Value())
   198  
   199  	ctx, cancel := context.WithCancel(context.Background())
   200  	retCh := make(chan RespWithErr, 1)
   201  	go func() {
   202  		resp, pErr := q.MaybeWaitForPush(ctx, &req)
   203  		retCh <- RespWithErr{resp, pErr}
   204  	}()
   205  
   206  	testutils.SucceedsSoon(t, func() error {
   207  		select {
   208  		case rwe := <-retCh:
   209  			t.Fatalf("MaybeWaitForPush terminated prematurely: %+v", rwe)
   210  		default:
   211  		}
   212  		expDeps := []uuid.UUID{pusher.ID}
   213  		if deps := q.GetDependents(txn.ID); !reflect.DeepEqual(deps, expDeps) {
   214  			return errors.Errorf("expected GetDependents %+v; got %+v", expDeps, deps)
   215  		}
   216  		if act, exp := m.PusherWaiting.Value(), int64(1); act != exp {
   217  			return errors.Errorf("%d pushers, but want %d", act, exp)
   218  		}
   219  		return nil
   220  	})
   221  	cancel()
   222  
   223  	respWithErr := <-retCh
   224  	if respWithErr.resp != nil {
   225  		t.Errorf("expected nil response; got %+v", respWithErr.resp)
   226  	}
   227  	if !testutils.IsPError(respWithErr.pErr, context.Canceled.Error()) {
   228  		t.Errorf("expected context canceled error; got %v", respWithErr.pErr)
   229  	}
   230  }
   231  
// TestTxnWaitQueueUpdateTxn creates two waiters on a txn and verifies
// both are returned when the txn is updated. The update marks the pushee as
// COMMITTED; each waiter is expected to receive a response carrying that
// status, and all waiting gauges are expected to drop back to zero.
func TestTxnWaitQueueUpdateTxn(t *testing.T) {
	defer leaktest.AfterTest(t)()
	tc := testContext{}
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	tc.Start(t, stopper)

	txn, err := createTxnForPushQueue(context.Background(), &tc)
	if err != nil {
		t.Fatal(err)
	}
	pusher1 := newTransaction("pusher1", roachpb.Key("a"), 1, tc.Clock())
	pusher2 := newTransaction("pusher2", roachpb.Key("a"), 1, tc.Clock())
	req1 := roachpb.PushTxnRequest{
		PushType:  roachpb.PUSH_ABORT,
		PusherTxn: *pusher1,
		PusheeTxn: txn.TxnMeta,
	}
	// req2 is a copy of req1 that differs only in the pusher.
	req2 := req1
	req2.PusherTxn = *pusher2

	q := tc.repl.concMgr.TxnWaitQueue()
	q.Enable()
	q.EnqueueTxn(txn)
	m := tc.store.txnWaitMetrics
	assert.EqualValues(tc, 1, m.PusheeWaiting.Value())

	// The channel is buffered for both waiters so neither goroutine blocks on
	// its send.
	retCh := make(chan RespWithErr, 2)
	go func() {
		resp, pErr := q.MaybeWaitForPush(context.Background(), &req1)
		retCh <- RespWithErr{resp, pErr}
	}()
	// Wait for the first pusher to be registered before starting the second,
	// so that the dependents list has a known order below.
	testutils.SucceedsSoon(t, func() error {
		expDeps := []uuid.UUID{pusher1.ID}
		if deps := q.GetDependents(txn.ID); !reflect.DeepEqual(deps, expDeps) {
			return errors.Errorf("expected GetDependents %+v; got %+v", expDeps, deps)
		}
		return nil
	})
	testutils.SucceedsSoon(t, func() error {
		if act, exp := m.PusherWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d pushers, but want %d", act, exp)
		}
		if act, exp := m.PusheeWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d pushees, but want %d", act, exp)
		}
		if act, exp := m.QueryWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d queries, but want %d", act, exp)
		}
		return nil
	})

	go func() {
		resp, pErr := q.MaybeWaitForPush(context.Background(), &req2)
		retCh <- RespWithErr{resp, pErr}
	}()
	// With both pushers parked, the pusher and query gauges are expected to
	// read 2 while the pushee gauge stays at 1.
	testutils.SucceedsSoon(t, func() error {
		expDeps := []uuid.UUID{pusher1.ID, pusher2.ID}
		if deps := q.GetDependents(txn.ID); !reflect.DeepEqual(deps, expDeps) {
			return errors.Errorf("expected GetDependents %+v; got %+v", expDeps, deps)
		}
		if act, exp := m.PusherWaiting.Value(), int64(2); act != exp {
			return errors.Errorf("%d pushers, but want %d", act, exp)
		}
		if act, exp := m.PusheeWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d pushees, but want %d", act, exp)
		}
		if act, exp := m.QueryWaiting.Value(), int64(2); act != exp {
			return errors.Errorf("%d queries, but want %d", act, exp)
		}
		return nil
	})

	// Update a copy of the pushee with COMMITTED status; the original txn
	// struct is left untouched.
	updatedTxn := *txn
	updatedTxn.Status = roachpb.COMMITTED
	q.UpdateTxn(context.Background(), &updatedTxn)
	testutils.SucceedsSoon(tc.TB, func() error {
		return checkAllGaugesZero(tc)
	})

	// Both waiters must report the committed pushee.
	for i := 0; i < 2; i++ {
		respWithErr := <-retCh
		if respWithErr.resp == nil || respWithErr.resp.PusheeTxn.Status != roachpb.COMMITTED {
			t.Errorf("expected committed txn response; got %+v, err=%v", respWithErr.resp, respWithErr.pErr)
		}
	}
}
   321  
   322  // TestTxnWaitQueueTxnSilentlyCompletes creates a waiter on a txn and verifies
   323  // that the waiter is eventually unblocked when the txn commits but UpdateTxn is
   324  // not called.
   325  //
   326  // This simulates the following observed sequence of events. A transaction, TA,
   327  // writes a key K. Another transaction, TB, attempts to read K. It notices the
   328  // intent on K and sends a PushTxnRequest. The PushTxnRequest fails and returns
   329  // a TransactionPushError. Before the replica handles the TransactionPushError,
   330  // TA commits and the replica fully processes its EndTxnRequest. Only then does
   331  // the replica notice the TransactionPushError and put TB's PushTxnRequest into
   332  // TA's wait queue. Updates to TA will never be sent via Queue.UpdateTxn,
   333  // because Queue.UpdateTxn was already called when the EndTxnRequest was
   334  // processed, before TB's PushTxnRequest was in TA's wait queue.
   335  //
   336  // This sequence of events was previously mishandled when TA's transaction
   337  // record was not immediately cleaned up, e.g. because it had non-local intents.
   338  // The wait queue would continually poll TA's transaction record, notice it
   339  // still existed, and continue waiting. In production, this meant that the
   340  // PushTxnRequest would get stuck waiting out the full TxnLivenessThreshold for
   341  // the transaction record to expire. In unit tests, where the clock might never
   342  // be advanced, the PushTxnRequest could get stuck forever.
func TestTxnWaitQueueTxnSilentlyCompletes(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// This test relies on concurrently waiting for a value to change in the
	// underlying engine(s). Since the teeing engine does not respond well to
	// value mismatches, whether transient or permanent, skip this test if the
	// teeing engine is being used. See
	// https://github.com/cockroachdb/cockroach/issues/42656 for more context.
	if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB {
		t.Skip("disabled on teeing engine")
	}
	tc := testContext{}
	ctx := context.Background()
	stopper := stop.NewStopper()
	defer stopper.Stop(ctx)
	tc.Start(t, stopper)

	txn, err := createTxnForPushQueue(ctx, &tc)
	if err != nil {
		t.Fatal(err)
	}
	pusher := newTransaction("pusher", roachpb.Key("a"), 1, tc.Clock())
	// Unlike most other tests in this file, the request header carries the
	// pushee's key.
	req := &roachpb.PushTxnRequest{
		RequestHeader: roachpb.RequestHeader{
			Key: txn.Key,
		},
		PushType:  roachpb.PUSH_ABORT,
		PusherTxn: *pusher,
		PusheeTxn: txn.TxnMeta,
	}

	q := tc.repl.concMgr.TxnWaitQueue()
	q.Enable()
	q.EnqueueTxn(txn)

	// Only one waiter sends on this channel; the buffer keeps that send from
	// blocking.
	retCh := make(chan RespWithErr, 2)
	go func() {
		resp, pErr := q.MaybeWaitForPush(context.Background(), req)
		retCh <- RespWithErr{resp, pErr}
	}()

	// Wait until the pusher is registered as a dependent of txn and the
	// gauges show one pusher, one pushee and one query waiting.
	m := tc.store.txnWaitMetrics
	testutils.SucceedsSoon(t, func() error {
		expDeps := []uuid.UUID{pusher.ID}
		if deps := q.GetDependents(txn.ID); !reflect.DeepEqual(deps, expDeps) {
			return errors.Errorf("expected GetDependents %+v; got %+v", expDeps, deps)
		}
		if act, exp := m.PusherWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d pushers, but want %d", act, exp)
		}
		if act, exp := m.PusheeWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d pushees, but want %d", act, exp)
		}
		if act, exp := m.QueryWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d queries, but want %d", act, exp)
		}
		return nil
	})

	// Rewrite the record as COMMITTED directly in the engine, bypassing the
	// queue entirely.
	txn.Status = roachpb.COMMITTED
	if err := writeTxnRecord(ctx, &tc, txn); err != nil {
		t.Fatal(err)
	}

	// Skip calling q.UpdateTxn to test that the wait queue periodically polls
	// txn's record and notices when it is no longer pending.

	respWithErr := <-retCh
	if respWithErr.resp == nil || respWithErr.resp.PusheeTxn.Status != roachpb.COMMITTED {
		t.Errorf("expected committed txn response; got %+v, err=%v", respWithErr.resp, respWithErr.pErr)
	}
	// NOTE(review): after the waiter has returned, the test expects the
	// PusherWaiting and PusheeWaiting gauges to still read 1 and only
	// QueryWaiting to drop to 0. Presumably the pusher/pushee gauges are only
	// reset through UpdateTxn or Clear, neither of which is called here —
	// confirm against the queue implementation.
	testutils.SucceedsSoon(t, func() error {
		if act, exp := m.PusherWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d pushers, but want %d", act, exp)
		}
		if act, exp := m.PusheeWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d pushees, but want %d", act, exp)
		}
		if act, exp := m.QueryWaiting.Value(), int64(0); act != exp {
			return errors.Errorf("%d queries, but want %d", act, exp)
		}
		return nil
	})
}
   426  
   427  // TestTxnWaitQueueUpdateNotPushedTxn verifies that no PushTxnResponse
   428  // is returned in the event that the pushee txn only has its timestamp
   429  // updated.
   430  func TestTxnWaitQueueUpdateNotPushedTxn(t *testing.T) {
   431  	defer leaktest.AfterTest(t)()
   432  	tc := testContext{}
   433  	stopper := stop.NewStopper()
   434  	defer stopper.Stop(context.Background())
   435  	tc.Start(t, stopper)
   436  
   437  	txn, err := createTxnForPushQueue(context.Background(), &tc)
   438  	if err != nil {
   439  		t.Fatal(err)
   440  	}
   441  	pusher := newTransaction("pusher", roachpb.Key("a"), 1, tc.Clock())
   442  	req := roachpb.PushTxnRequest{
   443  		PushType:  roachpb.PUSH_ABORT,
   444  		PusherTxn: *pusher,
   445  		PusheeTxn: txn.TxnMeta,
   446  	}
   447  
   448  	q := tc.repl.concMgr.TxnWaitQueue()
   449  	q.Enable()
   450  	q.EnqueueTxn(txn)
   451  
   452  	retCh := make(chan RespWithErr, 1)
   453  	go func() {
   454  		resp, pErr := q.MaybeWaitForPush(context.Background(), &req)
   455  		retCh <- RespWithErr{resp, pErr}
   456  	}()
   457  
   458  	testutils.SucceedsSoon(t, func() error {
   459  		expDeps := []uuid.UUID{pusher.ID}
   460  		if deps := q.GetDependents(txn.ID); !reflect.DeepEqual(deps, expDeps) {
   461  			return errors.Errorf("expected GetDependents %+v; got %+v", expDeps, deps)
   462  		}
   463  		return nil
   464  	})
   465  
   466  	updatedTxn := *txn
   467  	updatedTxn.WriteTimestamp = txn.WriteTimestamp.Add(1, 0)
   468  	q.UpdateTxn(context.Background(), &updatedTxn)
   469  
   470  	respWithErr := <-retCh
   471  	if respWithErr.resp != nil {
   472  		t.Errorf("on non-committed txn update, expected nil response; got %+v", respWithErr.resp)
   473  	}
   474  	if respWithErr.pErr != nil {
   475  		t.Errorf("expected nil error; got %s", respWithErr.pErr)
   476  	}
   477  	testutils.SucceedsSoon(tc.TB, func() error {
   478  		return checkAllGaugesZero(tc)
   479  	})
   480  }
   481  
// TestTxnWaitQueuePusheeExpires verifies that pushers waiting on a pushee
// whose txn appears expired are released with a nil response and a nil
// error. Two pushers are parked on the same pushee and both are expected to
// return this way. The test also counts QueryTxn requests addressed to the
// pushee's key and requires at least two of them.
func TestTxnWaitQueuePusheeExpires(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// queryTxnCount is incremented from the eval filter and read at the end
	// of the test, so it is only accessed through sync/atomic.
	var queryTxnCount int32

	manual := hlc.NewManualClock(123)
	clock := hlc.NewClock(manual.UnixNano, time.Nanosecond)
	txn := newTransaction("txn", roachpb.Key("a"), 1, clock)
	// Move the clock forward so that when the PushTxn is sent, the txn appears
	// expired.
	manual.Set(txnwait.TxnExpiration(txn).WallTime)

	tc := testContext{}
	tsc := TestStoreConfig(clock)
	// Count every QueryTxnRequest whose txn key matches the pushee's key. The
	// filter never injects an error.
	tsc.TestingKnobs.EvalKnobs.TestingEvalFilter =
		func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
			if qtReq, ok := filterArgs.Req.(*roachpb.QueryTxnRequest); ok && bytes.Equal(qtReq.Txn.Key, txn.Key) {
				atomic.AddInt32(&queryTxnCount, 1)
			}
			return nil
		}
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	tc.StartWithStoreConfig(t, stopper, tsc)

	pusher1 := newTransaction("pusher1", roachpb.Key("a"), 1, tc.Clock())
	pusher2 := newTransaction("pusher2", roachpb.Key("a"), 1, tc.Clock())
	req1 := roachpb.PushTxnRequest{
		PushType:  roachpb.PUSH_ABORT,
		PusherTxn: *pusher1,
		PusheeTxn: txn.TxnMeta,
	}
	// req2 is a copy of req1 that differs only in the pusher.
	req2 := req1
	req2.PusherTxn = *pusher2

	// Create a "fake" txn record.
	if err := writeTxnRecord(context.Background(), &tc, txn); err != nil {
		t.Fatal(err)
	}

	q := tc.repl.concMgr.TxnWaitQueue()
	q.Enable()
	q.EnqueueTxn(txn)

	// The channel is buffered for both waiters so neither goroutine blocks on
	// its send.
	retCh := make(chan RespWithErr, 2)
	go func() {
		resp, pErr := q.MaybeWaitForPush(context.Background(), &req1)
		retCh <- RespWithErr{resp, pErr}
	}()
	// Wait for the first pusher to be registered before starting the second,
	// so that the dependents list has a known order below.
	testutils.SucceedsSoon(t, func() error {
		expDeps := []uuid.UUID{pusher1.ID}
		if deps := q.GetDependents(txn.ID); !reflect.DeepEqual(deps, expDeps) {
			return errors.Errorf("expected GetDependents %+v; got %+v", expDeps, deps)
		}
		return nil
	})

	go func() {
		resp, pErr := q.MaybeWaitForPush(context.Background(), &req2)
		retCh <- RespWithErr{resp, pErr}
	}()
	testutils.SucceedsSoon(t, func() error {
		expDeps := []uuid.UUID{pusher1.ID, pusher2.ID}
		if deps := q.GetDependents(txn.ID); !reflect.DeepEqual(deps, expDeps) {
			return errors.Errorf("expected GetDependents %+v; got %+v", expDeps, deps)
		}
		return nil
	})

	// Both waiters are expected to return without a response and without an
	// error.
	for i := 0; i < 2; i++ {
		respWithErr := <-retCh
		if respWithErr.resp != nil {
			t.Errorf("expected nil txn response; got %+v", respWithErr.resp)
		}
		if respWithErr.pErr != nil {
			t.Errorf("expected nil error; got %s", respWithErr.pErr)
		}
	}

	// NOTE(review): after both waiters have returned, the test expects the
	// PusherWaiting gauge to still read 2 and PusheeWaiting to still read 1,
	// with only QueryWaiting back at 0. Presumably the pusher/pushee gauges
	// are only reset through UpdateTxn or Clear, neither of which is called
	// here — confirm against the queue implementation.
	m := tc.store.txnWaitMetrics
	testutils.SucceedsSoon(t, func() error {
		if act, exp := m.PusherWaiting.Value(), int64(2); act != exp {
			return errors.Errorf("%d pushers, but want %d", act, exp)
		}
		if act, exp := m.PusheeWaiting.Value(), int64(1); act != exp {
			return errors.Errorf("%d pushees, but want %d", act, exp)
		}
		if act, exp := m.QueryWaiting.Value(), int64(0); act != exp {
			return errors.Errorf("%d queries, but want %d", act, exp)
		}
		return nil
	})
	if a, minExpected := atomic.LoadInt32(&queryTxnCount), int32(2); a < minExpected {
		t.Errorf("expected no fewer than %d query txns; got %d", minExpected, a)
	}
}
   579  
// TestTxnWaitQueuePusherUpdate verifies that the pusher's status is
// periodically updated and will notice if the pusher has been aborted.
//
// The test runs once with a pusher txn record written up front and once
// without, and for each of those with the ABORTED record written at an epoch
// below, equal to, and above the epoch carried by the push request. In every
// combination the waiter is expected to return a nil response and a
// TransactionAbortedError(ABORT_REASON_PUSHER_ABORTED) error.
func TestTxnWaitQueuePusherUpdate(t *testing.T) {
	defer leaktest.AfterTest(t)()

	testutils.RunTrueAndFalse(t, "txnRecordExists", func(t *testing.T, txnRecordExists bool) {
		// Test with the pusher txn record below the pusher's expected epoch, at
		// the pusher's expected epoch, and above the pusher's expected epoch.
		// Regardless of which epoch the transaction record is written at, if
		// it is marked as ABORTED, it should terminate the push.
		pushEpoch := enginepb.TxnEpoch(2)
		for _, c := range []struct {
			name        string
			recordEpoch enginepb.TxnEpoch
		}{
			{"below", pushEpoch - 1},
			{"equal", pushEpoch},
			{"above", pushEpoch + 1},
		} {
			t.Run(fmt.Sprintf("recordEpoch=%s", c.name), func(t *testing.T) {
				// Each subtest gets its own test context and stopper.
				tc := testContext{}
				stopper := stop.NewStopper()
				defer stopper.Stop(context.Background())
				tc.Start(t, stopper)

				txn, err := createTxnForPushQueue(context.Background(), &tc)
				if err != nil {
					t.Fatal(err)
				}
				// Either create the pusher together with a persisted record,
				// or create it in memory only.
				var pusher *roachpb.Transaction
				if txnRecordExists {
					pusher, err = createTxnForPushQueue(context.Background(), &tc)
					if err != nil {
						t.Fatal(err)
					}
				} else {
					pusher = newTransaction("pusher", roachpb.Key("a"), 1, tc.Clock())
				}
				// The epoch is set on the in-memory pusher only; a record
				// written above by createTxnForPushQueue is not rewritten here.
				pusher.Epoch = pushEpoch

				req := roachpb.PushTxnRequest{
					PushType:  roachpb.PUSH_ABORT,
					PusherTxn: *pusher,
					PusheeTxn: txn.TxnMeta,
				}

				q := tc.repl.concMgr.TxnWaitQueue()
				q.Enable()
				q.EnqueueTxn(txn)

				retCh := make(chan RespWithErr, 1)
				go func() {
					resp, pErr := q.MaybeWaitForPush(context.Background(), &req)
					retCh <- RespWithErr{resp, pErr}
				}()

				// Wait for the pusher to be registered as a dependent of the
				// pushee.
				testutils.SucceedsSoon(t, func() error {
					expDeps := []uuid.UUID{pusher.ID}
					if deps := q.GetDependents(txn.ID); !reflect.DeepEqual(deps, expDeps) {
						return errors.Errorf("expected GetDependents %+v; got %+v", expDeps, deps)
					}
					return nil
				})

				// If the record doesn't exist yet, give the push queue enough
				// time to query the missing record and notice.
				if !txnRecordExists {
					time.Sleep(10 * time.Millisecond)
				}

				// Update txn on disk with status ABORTED.
				pusherUpdate := *pusher
				pusherUpdate.Epoch = c.recordEpoch
				pusherUpdate.Status = roachpb.ABORTED
				if err := writeTxnRecord(context.Background(), &tc, &pusherUpdate); err != nil {
					t.Fatal(err)
				}
				q.UpdateTxn(context.Background(), &pusherUpdate)

				respWithErr := <-retCh
				if respWithErr.resp != nil {
					t.Errorf("expected nil response; got %+v", respWithErr.resp)
				}
				// The expected error text contains parentheses, so it is
				// quoted before being used as a regular expression.
				expErr := "TransactionAbortedError(ABORT_REASON_PUSHER_ABORTED)"
				if !testutils.IsPError(respWithErr.pErr, regexp.QuoteMeta(expErr)) {
					t.Errorf("expected %s; got %v", expErr, respWithErr.pErr)
				}

				// NOTE(review): after the waiter has returned, the test
				// expects the PusherWaiting and PusheeWaiting gauges to still
				// read 1 and only QueryWaiting to be 0 — confirm against the
				// queue implementation why these are not reset here.
				m := tc.store.txnWaitMetrics
				testutils.SucceedsSoon(t, func() error {
					if act, exp := m.PusherWaiting.Value(), int64(1); act != exp {
						return errors.Errorf("%d pushers, but want %d", act, exp)
					}
					if act, exp := m.PusheeWaiting.Value(), int64(1); act != exp {
						return errors.Errorf("%d pushees, but want %d", act, exp)
					}
					if act, exp := m.QueryWaiting.Value(), int64(0); act != exp {
						return errors.Errorf("%d queries, but want %d", act, exp)
					}
					return nil
				})
			})
		}
	})
}
   685  
// ReqWithRespAndErr pairs a push request with the response and error that
// MaybeWaitForPush returned for it, so that results arriving on a shared
// channel can be attributed to the request that produced them.
type ReqWithRespAndErr struct {
	// req is the request that was passed to MaybeWaitForPush.
	req *roachpb.PushTxnRequest
	// resp is the response returned for req; may be nil.
	resp *roachpb.PushTxnResponse
	// pErr is the error returned for req; may be nil.
	pErr *roachpb.Error
}
   691  
// TestTxnWaitQueueDependencyCycle verifies that if txn A pushes txn B
// pushes txn C which in turn is pushing txn A, the cycle will be
// detected and broken by a higher priority pusher.
//
// The test requires that at least one of the three pushes succeeds with an
// ABORTED pushee, that every failed push reports either a canceled context or
// TransactionAbortedError(ABORT_REASON_PUSHER_ABORTED), and that the
// DeadlocksTotal counter ends up at one or more.
func TestTxnWaitQueueDependencyCycle(t *testing.T) {
	defer leaktest.AfterTest(t)()
	tc := testContext{}
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	tc.Start(t, stopper)

	txnA, err := createTxnForPushQueue(context.Background(), &tc)
	if err != nil {
		t.Fatal(err)
	}
	txnB, err := createTxnForPushQueue(context.Background(), &tc)
	if err != nil {
		t.Fatal(err)
	}
	txnC, err := createTxnForPushQueue(context.Background(), &tc)
	if err != nil {
		t.Fatal(err)
	}

	// Build the cycle A -> B -> C -> A. Each request is addressed to its
	// pushee's key.
	reqA := &roachpb.PushTxnRequest{
		RequestHeader: roachpb.RequestHeader{
			Key: txnB.Key,
		},
		PushType:  roachpb.PUSH_ABORT,
		PusherTxn: *txnA,
		PusheeTxn: txnB.TxnMeta,
	}
	reqB := &roachpb.PushTxnRequest{
		RequestHeader: roachpb.RequestHeader{
			Key: txnC.Key,
		},
		PushType:  roachpb.PUSH_ABORT,
		PusherTxn: *txnB,
		PusheeTxn: txnC.TxnMeta,
	}
	reqC := &roachpb.PushTxnRequest{
		RequestHeader: roachpb.RequestHeader{
			Key: txnA.Key,
		},
		PushType:  roachpb.PUSH_ABORT,
		PusherTxn: *txnC,
		PusheeTxn: txnA.TxnMeta,
	}

	q := tc.repl.concMgr.TxnWaitQueue()
	q.Enable()

	// All three pushers wait under the same cancelable context. The deferred
	// cancel covers the paths where the loop below never cancels it.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	for _, txn := range []*roachpb.Transaction{txnA, txnB, txnC} {
		q.EnqueueTxn(txn)
	}
	m := tc.store.txnWaitMetrics
	assert.EqualValues(tc, 0, m.DeadlocksTotal.Count())

	reqs := []*roachpb.PushTxnRequest{reqA, reqB, reqC}
	// The channel has room for every result so no waiter goroutine blocks on
	// its send.
	retCh := make(chan ReqWithRespAndErr, len(reqs))
	for _, req := range reqs {
		// req is passed as an argument so each goroutine works on its own
		// request rather than on the shared loop variable.
		go func(req *roachpb.PushTxnRequest) {
			resp, pErr := q.MaybeWaitForPush(ctx, req)
			retCh <- ReqWithRespAndErr{req, resp, pErr}
		}(req)
	}

	// Wait for first request to finish, which should break the dependency cycle
	// by performing a force push abort. This will allow all other requests to
	// proceed. At least one txn will be aborted by another txn, although it's
	// possible that up to two are in the case that the deadlock is detected by
	// multiple txns concurrently.
	var pushed bool
	for i := 0; i < len(reqs); i++ {
		ret := <-retCh
		if ret.pErr != nil {
			// A failed push is acceptable only if it was canceled or if its
			// pusher was itself aborted.
			if !testutils.IsPError(ret.pErr, context.Canceled.Error()) {
				require.Regexp(t, `TransactionAbortedError\(ABORT_REASON_PUSHER_ABORTED\)`, ret.pErr)
			}
		} else {
			pushed = true
			require.NotNil(t, ret.resp)
			require.Equal(t, roachpb.ABORTED, ret.resp.PusheeTxn.Status)

			// Cancel the pushers' context after the deadlock is initially broken.
			cancel()
		}
	}
	require.True(t, pushed)
	require.GreaterOrEqual(t, m.DeadlocksTotal.Count(), int64(1))
}
   784  
// TestTxnWaitQueueDependencyCycleWithPriorityInversion verifies that
// priority inversions between two dependent transactions are noticed
// and the dependency is appropriately broken.
//
// The test requires that txnA's push of txnB succeeds with txnB ABORTED, that
// txnB's push of txnA fails with
// TransactionAbortedError(ABORT_REASON_PUSHER_ABORTED), and that exactly one
// deadlock is counted.
//
// NOTE(review): the comments below talk about positive priorities while
// newTransaction is called with negative values (-1, -2). Presumably a
// negative user priority pins the txn's priority to its absolute value —
// confirm against newTransaction.
func TestTxnWaitQueueDependencyCycleWithPriorityInversion(t *testing.T) {
	defer leaktest.AfterTest(t)()
	tc := testContext{}
	stopper := stop.NewStopper()
	defer stopper.Stop(context.Background())
	tc.Start(t, stopper)

	// Create txnA with a lower priority so it won't think it could push
	// txnB without updating its priority.
	txnA := newTransaction("txn", roachpb.Key("a"), -1, tc.Clock())
	// However, write an "updated" txnA with higher priority, which it
	// will need to read via a QueryTxn request in order to realize it
	// can in fact break the deadlock.
	updatedTxnA := *txnA
	updatedTxnA.Priority = 3
	if err := writeTxnRecord(context.Background(), &tc, &updatedTxnA); err != nil {
		t.Fatal(err)
	}
	// Create txnB with priority=2, so txnA won't think it can push, but
	// when we set up txnB as the pusher, the request will include txnA's
	// updated priority, making txnB think it can't break a deadlock.
	txnB := newTransaction("txn", roachpb.Key("a"), -2, tc.Clock())
	if err := writeTxnRecord(context.Background(), &tc, txnB); err != nil {
		t.Fatal(err)
	}

	// reqA carries the stale in-memory txnA as the pusher, while reqB names
	// the updated txnA (priority 3) as its pushee.
	reqA := &roachpb.PushTxnRequest{
		RequestHeader: roachpb.RequestHeader{
			Key: txnB.Key,
		},
		PushType:  roachpb.PUSH_ABORT,
		PusherTxn: *txnA,
		PusheeTxn: txnB.TxnMeta,
	}
	reqB := &roachpb.PushTxnRequest{
		RequestHeader: roachpb.RequestHeader{
			Key: txnA.Key,
		},
		PushType:  roachpb.PUSH_ABORT,
		PusherTxn: *txnB,
		PusheeTxn: updatedTxnA.TxnMeta,
	}

	q := tc.repl.concMgr.TxnWaitQueue()
	q.Enable()

	for _, txn := range []*roachpb.Transaction{txnA, txnB} {
		q.EnqueueTxn(txn)
	}
	m := tc.store.txnWaitMetrics
	assert.EqualValues(tc, 0, m.DeadlocksTotal.Count())

	reqs := []*roachpb.PushTxnRequest{reqA, reqB}
	// The channel has room for every result so no waiter goroutine blocks on
	// its send.
	retCh := make(chan ReqWithRespAndErr, len(reqs))
	for _, req := range reqs {
		// req is passed as an argument so each goroutine works on its own
		// request rather than on the shared loop variable.
		go func(req *roachpb.PushTxnRequest) {
			resp, pErr := q.MaybeWaitForPush(context.Background(), req)
			retCh <- ReqWithRespAndErr{req, resp, pErr}
		}(req)
	}

	// Wait for the requests to finish. reqA should break the dependency
	// cycle by force pushing. reqB should notice that it was aborted.
	for i := 0; i < len(reqs); i++ {
		ret := <-retCh
		// Results are matched to their request by pointer identity, since
		// they may arrive in either order.
		switch ret.req {
		case reqA:
			require.Nil(t, ret.pErr)
			require.NotNil(t, ret.resp)
			require.Equal(t, txnB.ID, ret.resp.PusheeTxn.ID)
			require.Equal(t, roachpb.ABORTED, ret.resp.PusheeTxn.Status)
		case reqB:
			require.Regexp(t, `TransactionAbortedError\(ABORT_REASON_PUSHER_ABORTED\)`, ret.pErr)
		default:
			t.Fatal("unexpected")
		}
	}
	require.EqualValues(t, 1, m.DeadlocksTotal.Count())
}