github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/concurrency/concurrency_manager_test.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package concurrency_test

import (
	"bytes"
	"context"
	"fmt"
	"io/ioutil"
	"reflect"
	"runtime"
	"sort"
	"strconv"
	"strings"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/intentresolver"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	clustersettings "github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/datadriven"
	"github.com/cockroachdb/errors"
	"github.com/maruel/panicparse/stack"
	"github.com/petermattis/goid"
)

// TestConcurrencyManagerBasic verifies that sequences of requests interacting
// with a concurrency manager perform properly.
//
// The input files use the following DSL:
//
// new-txn name=<txn-name> ts=<int>[,<int>] epoch=<int> [maxts=<int>[,<int>]]
// new-request name=<req-name> txn=<txn-name>|none ts=<int>[,<int>] [priority] [consistency]
//   <proto-name> [<field-name>=<field-value>...] (hint: see scanSingleRequest)
// sequence req=<req-name>
// finish req=<req-name>
//
// handle-write-intent-error req=<req-name> txn=<txn-name> key=<key>
// handle-txn-push-error req=<req-name> txn=<txn-name> key=<key> TODO(nvanbenschoten): implement this
//
// on-lock-acquired req=<req-name> key=<key> [seq=<seq>] [dur=r|u]
// on-lock-updated req=<req-name> txn=<txn-name> key=<key> status=[committed|aborted|pending] [ts=<int>[,<int>]]
// on-txn-updated txn=<txn-name> status=[committed|aborted|pending] [ts=<int>[,<int>]]
//
// on-lease-updated leaseholder=<bool>
// on-split
// on-merge
// on-snapshot-applied
//
// debug-latch-manager
// debug-lock-table
// debug-disable-txn-pushes
// reset
//
func TestConcurrencyManagerBasic(t *testing.T) {
	defer leaktest.AfterTest(t)()

	datadriven.Walk(t, "testdata/concurrency_manager", func(t *testing.T, path string) {
		c := newCluster()
		c.enableTxnPushes()
		m := concurrency.NewManager(c.makeConfig())
		m.OnRangeLeaseUpdated(true /* isLeaseholder */) // enable
		c.m = m
		mon := newMonitor()
		datadriven.RunTest(t, path, func(t *testing.T, d *datadriven.TestData) string {
			switch d.Cmd {
			case "new-txn":
				var txnName string
				d.ScanArgs(t, "name", &txnName)
				ts := scanTimestamp(t, d)

				var epoch int
				d.ScanArgs(t, "epoch", &epoch)

				maxTS := ts
				if d.HasArg("maxts") {
					maxTS = scanTimestampWithName(t, d, "maxts")
				}

				txn, ok := c.txnsByName[txnName]
				var id uuid.UUID
				if ok {
					id = txn.ID
				} else {
					id = c.newTxnID()
				}
				txn = &roachpb.Transaction{
					TxnMeta: enginepb.TxnMeta{
						ID:             id,
						Epoch:          enginepb.TxnEpoch(epoch),
						WriteTimestamp: ts,
						MinTimestamp:   ts,
						Priority:       1, // not min or max
					},
					ReadTimestamp: ts,
					MaxTimestamp:  maxTS,
				}
				txn.UpdateObservedTimestamp(c.nodeDesc.NodeID, ts)
				c.registerTxn(txnName, txn)
				return ""

			case "new-request":
				var reqName string
				d.ScanArgs(t, "name", &reqName)
				if _, ok := c.requestsByName[reqName]; ok {
					d.Fatalf(t, "duplicate request: %s", reqName)
				}

				var txnName string
				d.ScanArgs(t, "txn", &txnName)
				txn, ok := c.txnsByName[txnName]
				if !ok && txnName != "none" {
					d.Fatalf(t, "unknown txn %s", txnName)
				}

				ts := scanTimestamp(t, d)
				if txn != nil {
					txn = txn.Clone()
					txn.ReadTimestamp = ts
					txn.WriteTimestamp = ts
				}

				readConsistency := roachpb.CONSISTENT
				if d.HasArg("inconsistent") {
					readConsistency = roachpb.INCONSISTENT
				}

				// Each roachpb.Request is provided on an indented line.
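				// For illustration only (hypothetical input, not copied from this
				// repository's testdata; scanSingleRequest defines the exact
				// grammar), the indented request lines typically look like:
				//
				//   get key=k
				//   put key=k value=v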
				var reqs []roachpb.Request
				singleReqLines := strings.Split(d.Input, "\n")
				for _, line := range singleReqLines {
					req := scanSingleRequest(t, d, line, c.txnsByName)
					reqs = append(reqs, req)
				}
				reqUnions := make([]roachpb.RequestUnion, len(reqs))
				for i, req := range reqs {
					reqUnions[i].MustSetInner(req)
				}
				latchSpans, lockSpans := c.collectSpans(t, txn, ts, reqs)

				c.requestsByName[reqName] = concurrency.Request{
					Txn:       txn,
					Timestamp: ts,
					// TODO(nvanbenschoten): test Priority
					ReadConsistency: readConsistency,
					Requests:        reqUnions,
					LatchSpans:      latchSpans,
					LockSpans:       lockSpans,
				}
				return ""

			case "sequence":
				var reqName string
				d.ScanArgs(t, "req", &reqName)
				req, ok := c.requestsByName[reqName]
				if !ok {
					d.Fatalf(t, "unknown request: %s", reqName)
				}

				c.mu.Lock()
				prev := c.guardsByReqName[reqName]
				delete(c.guardsByReqName, reqName)
				c.mu.Unlock()

				opName := fmt.Sprintf("sequence %s", reqName)
				mon.runAsync(opName, func(ctx context.Context) {
					guard, resp, err := m.SequenceReq(ctx, prev, req)
					if err != nil {
						log.Eventf(ctx, "sequencing complete, returned error: %v", err)
					} else if resp != nil {
						log.Eventf(ctx, "sequencing complete, returned response: %v", resp)
					} else if guard != nil {
						log.Event(ctx, "sequencing complete, returned guard")
						c.mu.Lock()
						c.guardsByReqName[reqName] = guard
						c.mu.Unlock()
					} else {
						log.Event(ctx, "sequencing complete, returned no guard")
					}
				})
				return c.waitAndCollect(t, mon)

			case "finish":
				var reqName string
				d.ScanArgs(t, "req", &reqName)
				guard, ok := c.guardsByReqName[reqName]
				if !ok {
					d.Fatalf(t, "unknown request: %s", reqName)
				}

				opName := fmt.Sprintf("finish %s", reqName)
				mon.runSync(opName, func(ctx context.Context) {
					log.Event(ctx, "finishing request")
					m.FinishReq(guard)
					c.mu.Lock()
					delete(c.guardsByReqName, reqName)
					c.mu.Unlock()
				})
				return c.waitAndCollect(t, mon)

			case "handle-write-intent-error":
				var reqName string
				d.ScanArgs(t, "req", &reqName)
				prev, ok := c.guardsByReqName[reqName]
				if !ok {
					d.Fatalf(t, "unknown request: %s", reqName)
				}

				// Each roachpb.Intent is provided on an indented line.
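				// As parsed below, each indented line has the form (txn and key
				// names here are hypothetical, not from this file's testdata):
				//
				//   intent txn=txn1 key=k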
				var intents []roachpb.Intent
				singleReqLines := strings.Split(d.Input, "\n")
				for _, line := range singleReqLines {
					var err error
					d.Cmd, d.CmdArgs, err = datadriven.ParseLine(line)
					if err != nil {
						d.Fatalf(t, "error parsing single intent: %v", err)
					}
					if d.Cmd != "intent" {
						d.Fatalf(t, "expected \"intent\", found %s", d.Cmd)
					}

					var txnName string
					d.ScanArgs(t, "txn", &txnName)
					txn, ok := c.txnsByName[txnName]
					if !ok {
						d.Fatalf(t, "unknown txn %s", txnName)
					}

					var key string
					d.ScanArgs(t, "key", &key)

					intents = append(intents, roachpb.MakeIntent(&txn.TxnMeta, roachpb.Key(key)))
				}

				opName := fmt.Sprintf("handle write intent error %s", reqName)
				mon.runAsync(opName, func(ctx context.Context) {
					wiErr := &roachpb.WriteIntentError{Intents: intents}
					guard, err := m.HandleWriterIntentError(ctx, prev, wiErr)
					if err != nil {
						log.Eventf(ctx, "handled %v, returned error: %v", wiErr, err)
						c.mu.Lock()
						delete(c.guardsByReqName, reqName)
						c.mu.Unlock()
					} else {
						log.Eventf(ctx, "handled %v, released latches", wiErr)
						c.mu.Lock()
						c.guardsByReqName[reqName] = guard
						c.mu.Unlock()
					}
				})
				return c.waitAndCollect(t, mon)

			case "on-lock-acquired":
				var reqName string
				d.ScanArgs(t, "req", &reqName)
				guard, ok := c.guardsByReqName[reqName]
				if !ok {
					d.Fatalf(t, "unknown request: %s", reqName)
				}
				txn := guard.Req.Txn

				var key string
				d.ScanArgs(t, "key", &key)

				var seq int
				if d.HasArg("seq") {
					d.ScanArgs(t, "seq", &seq)
				}
				seqNum := enginepb.TxnSeq(seq)

				dur := lock.Unreplicated
				if d.HasArg("dur") {
					dur = scanLockDurability(t, d)
				}

				// Confirm that the request has a corresponding write request.
				found := false
				for _, ru := range guard.Req.Requests {
					req := ru.GetInner()
					keySpan := roachpb.Span{Key: roachpb.Key(key)}
					if roachpb.IsLocking(req) &&
						req.Header().Span().Contains(keySpan) &&
						req.Header().Sequence == seqNum {
						found = true
						break
					}
				}
				if !found {
					d.Fatalf(t, "missing corresponding write request")
				}

				txnAcquire := txn.Clone()
				txnAcquire.Sequence = seqNum

				mon.runSync("acquire lock", func(ctx context.Context) {
					log.Eventf(ctx, "txn %s @ %s", txn.ID.Short(), key)
					acq := roachpb.MakeLockAcquisition(txnAcquire, roachpb.Key(key), dur)
					m.OnLockAcquired(ctx, &acq)
				})
				return c.waitAndCollect(t, mon)

			case "on-lock-updated":
				var reqName string
				d.ScanArgs(t, "req", &reqName)
				guard, ok := c.guardsByReqName[reqName]
				if !ok {
					d.Fatalf(t, "unknown request: %s", reqName)
				}

				var txnName string
				d.ScanArgs(t, "txn", &txnName)
				txn, ok := c.txnsByName[txnName]
				if !ok {
					d.Fatalf(t, "unknown txn %s", txnName)
				}

				var key string
				d.ScanArgs(t, "key", &key)

				status, verb := scanTxnStatus(t, d)
				var ts hlc.Timestamp
				if d.HasArg("ts") {
					ts = scanTimestamp(t, d)
				}

				// Confirm that the request has a corresponding ResolveIntent.
				found := false
				for _, ru := range guard.Req.Requests {
					if riReq := ru.GetResolveIntent(); riReq != nil &&
						riReq.IntentTxn.ID == txn.ID &&
						riReq.Key.Equal(roachpb.Key(key)) &&
						riReq.Status == status {
						found = true
						break
					}
				}
				if !found {
					d.Fatalf(t, "missing corresponding resolve intent request")
				}

				txnUpdate := txn.Clone()
				txnUpdate.Status = status
				txnUpdate.WriteTimestamp.Forward(ts)

				mon.runSync("update lock", func(ctx context.Context) {
					log.Eventf(ctx, "%s txn %s @ %s", verb, txn.ID.Short(), key)
					span := roachpb.Span{Key: roachpb.Key(key)}
					up := roachpb.MakeLockUpdate(txnUpdate, span)
					m.OnLockUpdated(ctx, &up)
				})
				return c.waitAndCollect(t, mon)

			case "on-txn-updated":
				var txnName string
				d.ScanArgs(t, "txn", &txnName)
				txn, ok := c.txnsByName[txnName]
				if !ok {
					d.Fatalf(t, "unknown txn %s", txnName)
				}

				status, verb := scanTxnStatus(t, d)
				var ts hlc.Timestamp
				if d.HasArg("ts") {
					ts = scanTimestamp(t, d)
				}

				mon.runSync("update txn", func(ctx context.Context) {
					log.Eventf(ctx, "%s %s", verb, txnName)
					if err := c.updateTxnRecord(txn.ID, status, ts); err != nil {
						d.Fatalf(t, err.Error())
					}
				})
				return c.waitAndCollect(t, mon)

			case "on-lease-updated":
				var isLeaseholder bool
				d.ScanArgs(t, "leaseholder", &isLeaseholder)

				mon.runSync("transfer lease", func(ctx context.Context) {
					if isLeaseholder {
						log.Event(ctx, "acquired")
					} else {
						log.Event(ctx, "released")
					}
					m.OnRangeLeaseUpdated(isLeaseholder)
				})
				return c.waitAndCollect(t, mon)

			case "on-split":
				mon.runSync("split range", func(ctx context.Context) {
					log.Event(ctx, "complete")
					m.OnRangeSplit()
				})
				return c.waitAndCollect(t, mon)

			case "on-merge":
				mon.runSync("merge range", func(ctx context.Context) {
					log.Event(ctx, "complete")
					m.OnRangeMerge()
				})
				return c.waitAndCollect(t, mon)

			case "on-snapshot-applied":
				mon.runSync("snapshot replica", func(ctx context.Context) {
					log.Event(ctx, "applied")
					m.OnReplicaSnapshotApplied()
				})
				return c.waitAndCollect(t, mon)

			case "debug-latch-manager":
				global, local := m.LatchMetrics()
				output := []string{
					fmt.Sprintf("write count: %d", global.WriteCount+local.WriteCount),
					fmt.Sprintf(" read count: %d", global.ReadCount+local.ReadCount),
				}
				return strings.Join(output, "\n")

			case "debug-lock-table":
				return m.LockTableDebug()

			case "debug-disable-txn-pushes":
				c.disableTxnPushes()
				return ""

			case "reset":
				if n := mon.numMonitored(); n > 0 {
					d.Fatalf(t, "%d requests still in flight", n)
				}
				mon.resetSeqNums()
				if err := c.reset(); err != nil {
					d.Fatalf(t, "could not reset cluster: %v", err)
				}
				// Reset request and txn namespace?
				if d.HasArg("namespace") {
					c.resetNamespace()
				}
				return ""

			default:
				return fmt.Sprintf("unknown command: %s", d.Cmd)
			}
		})
	})
}

// cluster encapsulates the state of a running cluster and a set of requests.
// It serves as the test harness in TestConcurrencyManagerBasic - maintaining
// transaction and request declarations, recording the state of in-flight
// requests as they flow through the concurrency manager, and mocking out the
// interfaces that the concurrency manager interacts with.
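//
// In particular, cluster implements the concurrency.IntentResolver interface
// (see PushTransaction, ResolveIntent, and ResolveIntents below), so pushes
// and intent resolution issued by the manager loop back into this harness
// rather than into a real intent resolver.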
type cluster struct {
	nodeDesc  *roachpb.NodeDescriptor
	rangeDesc *roachpb.RangeDescriptor
	st        *clustersettings.Settings
	m         concurrency.Manager

	// Definitions.
	txnCounter     uint32
	txnsByName     map[string]*roachpb.Transaction
	requestsByName map[string]concurrency.Request

	// Request state. Cleared on reset.
	mu              syncutil.Mutex
	guardsByReqName map[string]*concurrency.Guard
	txnRecords      map[uuid.UUID]*txnRecord
	txnPushes       map[uuid.UUID]*txnPush
}

type txnRecord struct {
	mu               syncutil.Mutex
	sig              chan struct{}
	txn              *roachpb.Transaction // immutable, modify fields below
	updatedStatus    roachpb.TransactionStatus
	updatedTimestamp hlc.Timestamp
}

type txnPush struct {
	ctx            context.Context
	pusher, pushee uuid.UUID
}

func newCluster() *cluster {
	return &cluster{
		st:        clustersettings.MakeTestingClusterSettings(),
		nodeDesc:  &roachpb.NodeDescriptor{NodeID: 1},
		rangeDesc: &roachpb.RangeDescriptor{RangeID: 1},

		txnsByName:      make(map[string]*roachpb.Transaction),
		requestsByName:  make(map[string]concurrency.Request),
		guardsByReqName: make(map[string]*concurrency.Guard),
		txnRecords:      make(map[uuid.UUID]*txnRecord),
		txnPushes:       make(map[uuid.UUID]*txnPush),
	}
}

func (c *cluster) makeConfig() concurrency.Config {
	return concurrency.Config{
		NodeDesc:       c.nodeDesc,
		RangeDesc:      c.rangeDesc,
		Settings:       c.st,
		IntentResolver: c,
		TxnWaitMetrics: txnwait.NewMetrics(time.Minute),
	}
}

// PushTransaction implements the concurrency.IntentResolver interface.
func (c *cluster) PushTransaction(
	ctx context.Context, pushee *enginepb.TxnMeta, h roachpb.Header, pushType roachpb.PushTxnType,
) (*roachpb.Transaction, *roachpb.Error) {
	pusheeRecord, err := c.getTxnRecord(pushee.ID)
	if err != nil {
		return nil, roachpb.NewError(err)
	}
	var pusherRecord *txnRecord
	if h.Txn != nil {
		pusherID := h.Txn.ID
		pusherRecord, err = c.getTxnRecord(pusherID)
		if err != nil {
			return nil, roachpb.NewError(err)
		}

		push, err := c.registerPush(ctx, pusherID, pushee.ID)
		if err != nil {
			return nil, roachpb.NewError(err)
		}
		defer c.unregisterPush(push)
	}
	for {
		// Is the pushee pushed?
		pusheeTxn, pusheeRecordSig := pusheeRecord.asTxn()
		var pushed bool
		switch pushType {
		case roachpb.PUSH_TIMESTAMP:
			pushed = h.Timestamp.Less(pusheeTxn.WriteTimestamp) || pusheeTxn.Status.IsFinalized()
		case roachpb.PUSH_ABORT:
			pushed = pusheeTxn.Status.IsFinalized()
		default:
			return nil, roachpb.NewErrorf("unexpected push type: %s", pushType)
		}
		if pushed {
			return pusheeTxn, nil
		}
		// Or the pusher aborted?
		var pusherRecordSig chan struct{}
		if pusherRecord != nil {
			var pusherTxn *roachpb.Transaction
			pusherTxn, pusherRecordSig = pusherRecord.asTxn()
			if pusherTxn.Status == roachpb.ABORTED {
				log.Eventf(ctx, "detected pusher aborted")
				err := roachpb.NewTransactionAbortedError(roachpb.ABORT_REASON_PUSHER_ABORTED)
				return nil, roachpb.NewError(err)
			}
		}
		// Wait until either record is updated.
		select {
		case <-pusheeRecordSig:
		case <-pusherRecordSig:
		case <-ctx.Done():
			return nil, roachpb.NewError(ctx.Err())
		}
	}
}

// ResolveIntent implements the concurrency.IntentResolver interface.
func (c *cluster) ResolveIntent(
	ctx context.Context, intent roachpb.LockUpdate, _ intentresolver.ResolveOptions,
) *roachpb.Error {
	log.Eventf(ctx, "resolving intent %s for txn %s with %s status", intent.Key, intent.Txn.ID.Short(), intent.Status)
	c.m.OnLockUpdated(ctx, &intent)
	return nil
}

// ResolveIntents implements the concurrency.IntentResolver interface.
func (c *cluster) ResolveIntents(
	ctx context.Context, intents []roachpb.LockUpdate, opts intentresolver.ResolveOptions,
) *roachpb.Error {
	log.Eventf(ctx, "resolving a batch of %d intent(s)", len(intents))
	for _, intent := range intents {
		if err := c.ResolveIntent(ctx, intent, opts); err != nil {
			return err
		}
	}
	return nil
}

func (c *cluster) newTxnID() uuid.UUID {
	c.mu.Lock()
	defer c.mu.Unlock()
	return nextUUID(&c.txnCounter)
}

func (c *cluster) registerTxn(name string, txn *roachpb.Transaction) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.txnsByName[name] = txn
	r := &txnRecord{txn: txn, sig: make(chan struct{})}
	c.txnRecords[txn.ID] = r
}

func (c *cluster) getTxnRecord(id uuid.UUID) (*txnRecord, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	r, ok := c.txnRecords[id]
	if !ok {
		return nil, errors.Errorf("unknown txn %v: %v", id, c.txnRecords)
	}
	return r, nil
}

func (c *cluster) updateTxnRecord(
	id uuid.UUID, status roachpb.TransactionStatus, ts hlc.Timestamp,
) error {
	c.mu.Lock()
	defer c.mu.Unlock()
	r, ok := c.txnRecords[id]
	if !ok {
		return errors.Errorf("unknown txn %v: %v", id, c.txnRecords)
	}
	r.mu.Lock()
	defer r.mu.Unlock()
	r.updatedStatus = status
	r.updatedTimestamp = ts
	// Notify all listeners. This is a poor man's composable cond var.
	close(r.sig)
	r.sig = make(chan struct{})
	return nil
}

func (r *txnRecord) asTxn() (*roachpb.Transaction, chan struct{}) {
	r.mu.Lock()
	defer r.mu.Unlock()
	txn := r.txn.Clone()
	if r.updatedStatus > txn.Status {
		txn.Status = r.updatedStatus
	}
	txn.WriteTimestamp.Forward(r.updatedTimestamp)
	return txn, r.sig
}

func (c *cluster) registerPush(ctx context.Context, pusher, pushee uuid.UUID) (*txnPush, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if _, ok := c.txnPushes[pusher]; ok {
		return nil, errors.Errorf("txn %v already pushing", pusher)
	}
	p := &txnPush{
		ctx:    ctx,
		pusher: pusher,
		pushee: pushee,
	}
	c.txnPushes[pusher] = p
	return p, nil
}

func (c *cluster) unregisterPush(push *txnPush) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.txnPushes, push.pusher)
}

// detectDeadlocks looks at all in-flight transaction pushes and determines
// whether any are blocked due to dependency cycles within transactions. If so,
// the method logs an event on the contexts of each of the members of the cycle.
func (c *cluster) detectDeadlocks() {
	// This cycle detection algorithm is not particularly efficient - at worst
	// it runs in O(n ^ 2) time. However, it's simple and effective at assigning
	// each member of each cycle a unique view of the cycle that it's a part of.
	// This works because we currently only allow a transaction to push a single
	// other transaction at a time.
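	//
	// For example (hypothetical transactions), if txn A pushes txn B and txn B
	// pushes txn A, the walk starting at A records the chain A->B->A and logs
	// "dependency cycle detected <A>-><B>-><A>" (using the txns' short IDs) on
	// A's push context, while the walk starting at B does the same from B's
	// point of view.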
	c.mu.Lock()
	defer c.mu.Unlock()
	var chain []uuid.UUID
	seen := make(map[uuid.UUID]struct{})
	for orig, origPush := range c.txnPushes {
		pusher := orig
		chain = append(chain[:0], orig)
		for id := range seen {
			delete(seen, id)
		}
		seen[pusher] = struct{}{}
		for {
			push, ok := c.txnPushes[pusher]
			if !ok {
				break
			}
			pusher = push.pushee
			chain = append(chain, pusher)
			if _, ok := seen[pusher]; ok {
				// Cycle detected!
				if pusher == orig {
					// The cycle we were looking for (i.e. starting at orig).
					var chainBuf strings.Builder
					for i, id := range chain {
						if i > 0 {
							chainBuf.WriteString("->")
						}
						chainBuf.WriteString(id.Short())
					}
					log.Eventf(origPush.ctx, "dependency cycle detected %s", chainBuf.String())
				}
				break
			}
			seen[pusher] = struct{}{}
		}
	}
}

func (c *cluster) enableTxnPushes() {
	concurrency.LockTableLivenessPushDelay.Override(&c.st.SV, 0*time.Millisecond)
	concurrency.LockTableDeadlockDetectionPushDelay.Override(&c.st.SV, 0*time.Millisecond)
}

func (c *cluster) disableTxnPushes() {
	concurrency.LockTableLivenessPushDelay.Override(&c.st.SV, time.Hour)
	concurrency.LockTableDeadlockDetectionPushDelay.Override(&c.st.SV, time.Hour)
}

// reset clears all request state in the cluster. This avoids portions of tests
// leaking into one another and also serves as an assertion that a sequence of
// commands has completed without leaking any requests.
func (c *cluster) reset() error {
	c.mu.Lock()
	defer c.mu.Unlock()
	// Reset all transactions to their original state.
	for id := range c.txnRecords {
		r := c.txnRecords[id]
		r.mu.Lock()
		r.updatedStatus = roachpb.PENDING
		r.updatedTimestamp = hlc.Timestamp{}
		r.mu.Unlock()
	}
	// There should be no remaining concurrency guards.
	for name := range c.guardsByReqName {
		return errors.Errorf("unfinished guard for request: %s", name)
	}
	// There should be no outstanding latches.
	global, local := c.m.LatchMetrics()
	if global.ReadCount > 0 || global.WriteCount > 0 ||
		local.ReadCount > 0 || local.WriteCount > 0 {
		return errors.Errorf("outstanding latches")
	}
	// Clear the lock table by transferring the lease away and reacquiring it.
	c.m.OnRangeLeaseUpdated(false /* isLeaseholder */)
	c.m.OnRangeLeaseUpdated(true /* isLeaseholder */)
	return nil
}

// resetNamespace resets the entire cluster namespace, clearing both request
// definitions and transaction definitions.
func (c *cluster) resetNamespace() {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.txnCounter = 0
	c.txnsByName = make(map[string]*roachpb.Transaction)
	c.requestsByName = make(map[string]concurrency.Request)
	c.txnRecords = make(map[uuid.UUID]*txnRecord)
}

// collectSpans collects the declared spans for a set of requests.
// Its logic mirrors that in Replica.collectSpans.
func (c *cluster) collectSpans(
	t *testing.T, txn *roachpb.Transaction, ts hlc.Timestamp, reqs []roachpb.Request,
) (latchSpans, lockSpans *spanset.SpanSet) {
	latchSpans, lockSpans = &spanset.SpanSet{}, &spanset.SpanSet{}
	h := roachpb.Header{Txn: txn, Timestamp: ts}
	for _, req := range reqs {
		if cmd, ok := batcheval.LookupCommand(req.Method()); ok {
			cmd.DeclareKeys(c.rangeDesc, h, req, latchSpans, lockSpans)
		} else {
			t.Fatalf("unrecognized command %s", req.Method())
		}
	}

	// Commands may create a large number of duplicate spans. De-duplicate
	// them to reduce the number of spans we pass to the spanlatch manager.
	for _, s := range [...]*spanset.SpanSet{latchSpans, lockSpans} {
		s.SortAndDedup()
		if err := s.Validate(); err != nil {
			t.Fatal(err)
		}
	}
	return latchSpans, lockSpans
}

func (c *cluster) waitAndCollect(t *testing.T, m *monitor) string {
	m.waitForAsyncGoroutinesToStall(t)
	c.detectDeadlocks()
	return m.collectRecordings()
}

// monitor tracks a set of running goroutines as they execute and captures
// tracing recordings from them. It is capable of watching its set of goroutines
// until they all mutually stall.
//
// It is NOT safe to use multiple monitors concurrently.
type monitor struct {
	seq int
	gs  map[*monitoredGoroutine]struct{}
	buf []byte // avoids allocations
}

type monitoredGoroutine struct {
	opSeq    int
	opName   string
	gID      int64
	finished int32

	ctx        context.Context
	collect    func() tracing.Recording
	cancel     func()
	prevEvents int
}

func newMonitor() *monitor {
	return &monitor{
		gs: make(map[*monitoredGoroutine]struct{}),
	}
}

func (m *monitor) runSync(opName string, fn func(context.Context)) {
	ctx, collect, cancel := tracing.ContextWithRecordingSpan(context.Background(), opName)
	g := &monitoredGoroutine{
		opSeq:   0, // synchronous
		opName:  opName,
		ctx:     ctx,
		collect: collect,
		cancel:  cancel,
	}
	m.gs[g] = struct{}{}
	fn(ctx)
	atomic.StoreInt32(&g.finished, 1)
}

func (m *monitor) runAsync(opName string, fn func(context.Context)) {
	m.seq++
	ctx, collect, cancel := tracing.ContextWithRecordingSpan(context.Background(), opName)
	g := &monitoredGoroutine{
		opSeq:   m.seq,
		opName:  opName,
		ctx:     ctx,
		collect: collect,
		cancel:  cancel,
	}
	m.gs[g] = struct{}{}
	go func() {
		atomic.StoreInt64(&g.gID, goid.Get())
		fn(ctx)
		atomic.StoreInt32(&g.finished, 1)
	}()
}

func (m *monitor) numMonitored() int {
	return len(m.gs)
}

func (m *monitor) resetSeqNums() {
	m.seq = 0
}

func (m *monitor) collectRecordings() string {
	// Collect trace recordings from each goroutine.
	type logRecord struct {
		g     *monitoredGoroutine
		value string
	}
	var logs []logRecord
	for g := range m.gs {
		prev := g.prevEvents
		rec := g.collect()
		for _, span := range rec {
			for _, log := range span.Logs {
				for _, field := range log.Fields {
					if prev > 0 {
						prev--
						continue
					}
					logs = append(logs, logRecord{
						g: g, value: field.Value,
					})
					g.prevEvents++
				}
			}
		}
		if atomic.LoadInt32(&g.finished) == 1 {
			g.cancel()
			delete(m.gs, g)
		}
	}

	// Sort logs by g.opSeq. This will sort synchronous goroutines before
	// asynchronous goroutines. Use a stable sort to break ties within
	// goroutines and keep logs in event order.
	sort.SliceStable(logs, func(i int, j int) bool {
		return logs[i].g.opSeq < logs[j].g.opSeq
	})

	var buf strings.Builder
	for i, log := range logs {
		if i > 0 {
			buf.WriteByte('\n')
		}
		seq := "-"
		if log.g.opSeq != 0 {
			seq = strconv.Itoa(log.g.opSeq)
		}
		fmt.Fprintf(&buf, "[%s] %s: %s", seq, log.g.opName, log.value)
	}
	return buf.String()
}

func (m *monitor) hasNewEvents(g *monitoredGoroutine) bool {
	events := 0
	rec := g.collect()
	for _, span := range rec {
		for _, log := range span.Logs {
			for range log.Fields {
				events++
			}
		}
	}
	return events > g.prevEvents
}

// waitForAsyncGoroutinesToStall waits for all goroutines that were launched by
// the monitor's runAsync method to stall due to cross-goroutine synchronization
// dependencies. For instance, it waits for all goroutines to stall while
// receiving from channels. When the method returns, the caller has exclusive
// access to any memory that it shares only with monitored goroutines until it
// performs an action that may unblock any of the goroutines.
func (m *monitor) waitForAsyncGoroutinesToStall(t *testing.T) {
	// Iterate until we see two iterations in a row that both observe all
	// monitored goroutines to be stalled and also both observe the exact same
	// goroutine state. The goroutine dump provides a consistent snapshot of all
	// goroutine states and statuses (runtime.Stack(all=true) stops the world).
	var status []*stack.Goroutine
	filter := funcName((*monitor).runAsync)
	testutils.SucceedsSoon(t, func() error {
		// Add a small fixed delay after each iteration. This is sufficient to
		// prevent false detection of stalls in a few cases, like when
		// receiving on a buffered channel that already has an element in it.
		defer time.Sleep(1 * time.Millisecond)

		prevStatus := status
		status = goroutineStatus(t, filter, &m.buf)
		if len(status) == 0 {
			// No monitored goroutines.
			return nil
		}

		if prevStatus == nil {
			// First iteration.
			return errors.Errorf("previous status unset")
		}

		// Check whether all monitored goroutines are stalled. If not, retry.
		for _, stat := range status {
			stalled, ok := goroutineStalledStates[stat.State]
			if !ok {
				// NB: this will help us avoid rotting on Go runtime changes.
				t.Fatalf("unknown goroutine state: %s", stat.State)
			}
			if !stalled {
				return errors.Errorf("goroutine %d is not stalled; status %s", stat.ID, stat.State)
			}
		}

		// Make sure the goroutines haven't changed since the last iteration.
		// This ensures that the goroutines stay stable for some amount of time.
		// NB: status and prevStatus are sorted by goroutine id.
		if !reflect.DeepEqual(status, prevStatus) {
			return errors.Errorf("goroutines rapidly changing")
		}
		return nil
	})

	// Add a trace event indicating where each stalled goroutine is waiting.
	byID := make(map[int64]*monitoredGoroutine, len(m.gs))
	for g := range m.gs {
		byID[atomic.LoadInt64(&g.gID)] = g
	}
	for _, stat := range status {
		g, ok := byID[int64(stat.ID)]
		if !ok {
			// If we're not tracking this goroutine, just ignore it. This can
			// happen when goroutines from earlier subtests haven't finished
			// yet.
			continue
		}
		// If the goroutine hasn't produced any new events, don't create a new
		// "blocked on" trace event. It likely hasn't moved since the last one.
		if !m.hasNewEvents(g) {
			continue
		}
		stalledCall := firstNonStdlib(stat.Stack.Calls)
		log.Eventf(g.ctx, "blocked on %s in %s", stat.State, stalledCall.Func.PkgDotName())
	}
}

func funcName(f interface{}) string {
	return runtime.FuncForPC(reflect.ValueOf(f).Pointer()).Name()
}

// goroutineStalledStates maps all goroutine states as reported by runtime.Stack
// to a boolean representing whether that state indicates a stalled goroutine. A
// stalled goroutine is one that is waiting on a change on another goroutine in
// order for it to make forward progress itself. If all goroutines enter stalled
// states simultaneously, a process would encounter a deadlock.
var goroutineStalledStates = map[string]bool{
	// See gStatusStrings in runtime/traceback.go and associated comments about
	// the corresponding G statuses in runtime/runtime2.go.
	"idle":      false,
	"runnable":  false,
	"running":   false,
	"syscall":   false,
	"waiting":   true,
	"dead":      false,
	"copystack": false,
	"???":       false, // catch-all in runtime.goroutineheader

	// runtime.goroutineheader may override these G statuses with a waitReason.
	// See waitReasonStrings in runtime/runtime2.go.
	"GC assist marking":       false,
	"IO wait":                 false,
	"chan receive (nil chan)": true,
	"chan send (nil chan)":    true,
	"dumping heap":            false,
	"garbage collection":      false,
	"garbage collection scan": false,
	"panicwait":               false,
	"select":                  true,
	"select (no cases)":       true,
	"GC assist wait":          false,
	"GC sweep wait":           false,
	"GC scavenge wait":        false,
	"chan receive":            true,
	"chan send":               true,
	"finalizer wait":          false,
	"force gc (idle)":         false,
	// Perhaps surprisingly, we mark "semacquire" as a non-stalled state. This
	// is because it is possible to see goroutines briefly enter this state when
	// performing fine-grained memory synchronization, occasionally in the Go
	// runtime itself. No request-level synchronization points use mutexes to
	// wait for state transitions by other requests, so it is safe to ignore
	// this state and wait for it to exit.
	"semacquire":             false,
	"sleep":                  false,
	"sync.Cond.Wait":         true,
	"timer goroutine (idle)": false,
	"trace reader (blocked)": false,
	"wait for GC cycle":      false,
	"GC worker (idle)":       false,
}

// goroutineStatus returns a stack trace for each goroutine whose stack frame
// matches the provided filter. It uses the provided buffer to avoid repeat
// allocations.
func goroutineStatus(t *testing.T, filter string, buf *[]byte) []*stack.Goroutine {
	// We don't know how big the buffer needs to be to collect all the
	// goroutines. Start with 64 KB and try a few times, doubling each time.
	// NB: This is inspired by runtime/pprof/pprof.go:writeGoroutineStacks.
	if len(*buf) == 0 {
		*buf = make([]byte, 1<<16)
	}
	var truncBuf []byte
	for i := 0; ; i++ {
		n := runtime.Stack(*buf, true /* all */)
		if n < len(*buf) {
			truncBuf = (*buf)[:n]
			break
		}
		*buf = make([]byte, 2*len(*buf))
	}

	// guesspaths=true is required for Call objects to have IsStdlib filled in.
	guesspaths := true
	ctx, err := stack.ParseDump(bytes.NewBuffer(truncBuf), ioutil.Discard, guesspaths)
	if err != nil {
		t.Fatalf("could not parse goroutine dump: %v", err)
		return nil
	}

	matching := ctx.Goroutines[:0]
	for _, g := range ctx.Goroutines {
		for _, call := range g.Stack.Calls {
			if strings.Contains(call.Func.Raw, filter) {
				matching = append(matching, g)
				break
			}
		}
	}
	return matching
}

func firstNonStdlib(calls []stack.Call) stack.Call {
	for _, call := range calls {
		if !call.IsStdlib {
			return call
		}
	}
	panic("unexpected")
}
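
// The DSL commands documented on TestConcurrencyManagerBasic compose into
// datadriven scripts under testdata/concurrency_manager. The sketch below is
// illustrative only: the txn/request names, key, value, and timestamps are
// hypothetical, the expected-output blocks are omitted, and the exact request
// grammar is defined by scanSingleRequest rather than by this file.
//
//   new-txn name=txn1 ts=10,1 epoch=0
//   new-request name=req1 txn=txn1 ts=10,1
//     put key=k value=v
//   sequence req=req1
//   on-lock-acquired req=req1 key=k
//   finish req=req1
//   reset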