github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_metrics_test.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
	"context"
	"fmt"
	"sync"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
)

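// checkGauge reports a test error if the gauge's current value does not match
// the expected value e, identifying the offending store by id.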
func checkGauge(t *testing.T, id string, g *metric.Gauge, e int64) {
	t.Helper()
	if a := g.Value(); a != e {
		t.Error(errors.Errorf("%s for store %s: gauge %d != computed %d", g.GetName(), id, a, e))
	}
}

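// verifyStats checks that the gauge metrics of the stores at the given
// indexes agree with MVCC stats recomputed from their engines. To avoid races
// with in-flight operations, it stops all stores in the multiTestContext for
// the duration of the check and restarts them afterwards.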
func verifyStats(t *testing.T, mtc *multiTestContext, storeIdxSlice ...int) {
	t.Helper()
	var stores []*kvserver.Store
	var wg sync.WaitGroup

	mtc.mu.RLock()
	numStores := len(mtc.stores)
	// We need to stop the stores at the given indexes, while keeping the reference to the
	// store objects. ComputeMVCCStats() still works on a stopped store (it needs
	// only the engine, which is still open), and the most recent stats are still
	// available on the stopped store object; however, no further information can
	// be committed to the store while it is stopped, preventing any races during
	// verification.
	for _, storeIdx := range storeIdxSlice {
		stores = append(stores, mtc.stores[storeIdx])
	}
	mtc.mu.RUnlock()

	// Sanity regression check for bug #4624: ensure intent count is zero.
	// This may not be true immediately due to the asynchronous nature of
	// non-local intent resolution.
	for _, s := range stores {
		m := s.Metrics()
		testutils.SucceedsSoon(t, func() error {
			if a := m.IntentCount.Value(); a != 0 {
				return fmt.Errorf("expected intent count to be zero, was %d", a)
			}
			return nil
		})
	}

	wg.Add(numStores)
	// We actually stop *all* of the Stores. Stopping only a few is riddled
	// with deadlocks since operations can span nodes, but stoppers don't
	// know about this - taking all of them down at the same time is the
	// only sane way of guaranteeing that nothing interesting happens, at
	// least when bringing down the nodes jeopardizes majorities.
	for i := 0; i < numStores; i++ {
		go func(i int) {
			defer wg.Done()
			mtc.stopStore(i)
		}(i)
	}
	wg.Wait()

	for _, s := range stores {
		idString := s.Ident.String()
		m := s.Metrics()

		// Sanity check: LiveBytes is not zero (ensures we don't have
		// zeroed-out structures).
		if liveBytes := m.LiveBytes.Value(); liveBytes == 0 {
			t.Errorf("store %s; got zero live bytes, expected non-zero", idString)
		}

		// Compute real total MVCC statistics from store.
		realStats, err := s.ComputeMVCCStats()
		if err != nil {
			t.Fatal(err)
		}

		// Ensure that real MVCC stats match computed stats.
		checkGauge(t, idString, m.LiveBytes, realStats.LiveBytes)
		checkGauge(t, idString, m.KeyBytes, realStats.KeyBytes)
		checkGauge(t, idString, m.ValBytes, realStats.ValBytes)
		checkGauge(t, idString, m.IntentBytes, realStats.IntentBytes)
		checkGauge(t, idString, m.LiveCount, realStats.LiveCount)
		checkGauge(t, idString, m.KeyCount, realStats.KeyCount)
		checkGauge(t, idString, m.ValCount, realStats.ValCount)
		checkGauge(t, idString, m.IntentCount, realStats.IntentCount)
		checkGauge(t, idString, m.SysBytes, realStats.SysBytes)
		checkGauge(t, idString, m.SysCount, realStats.SysCount)
		// "Ages" will be different depending on how much time has passed. Even with
		// a manual clock, this can be an issue in tests. Therefore, we do not
		// verify them in this test.
	}

	if t.Failed() {
		t.Fatalf("verifyStats failed, aborting test.")
	}

	// Restart all Stores.
	for i := 0; i < numStores; i++ {
		mtc.restartStore(i)
	}
}

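// verifyRocksDBStats recomputes the store's metrics and asserts that a
// selection of RocksDB gauges have reached sane minimum values.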
func verifyRocksDBStats(t *testing.T, s *kvserver.Store) {
	if err := s.ComputeMetrics(context.Background(), 0); err != nil {
		t.Fatal(err)
	}

	m := s.Metrics()
	testcases := []struct {
		gauge *metric.Gauge
		min   int64
	}{
		{m.RdbBlockCacheHits, 10},
		{m.RdbBlockCacheMisses, 0},
		{m.RdbBlockCacheUsage, 0},
		{m.RdbBlockCachePinnedUsage, 0},
		{m.RdbBloomFilterPrefixChecked, 20},
		{m.RdbBloomFilterPrefixUseful, 20},
		{m.RdbMemtableTotalSize, 5000},
		{m.RdbFlushes, 1},
		{m.RdbCompactions, 0},
		{m.RdbTableReadersMemEstimate, 50},
	}
	for _, tc := range testcases {
		if a := tc.gauge.Value(); a < tc.min {
			t.Errorf("gauge %s = %d < min %d", tc.gauge.GetName(), a, tc.min)
		}
	}
}

// TestStoreResolveMetrics verifies that metrics related to intent resolution
// are tracked properly.
func TestStoreResolveMetrics(t *testing.T) {
	defer leaktest.AfterTest(t)()

	// First prevent rot that would result from adding fields without handling
	// them everywhere.
	{
		act := fmt.Sprintf("%+v", result.Metrics{})
		exp := "{LeaseRequestSuccess:0 LeaseRequestError:0 LeaseTransferSuccess:0 LeaseTransferError:0 ResolveCommit:0 ResolveAbort:0 ResolvePoison:0}"
		if act != exp {
			t.Errorf("need to update this test due to added fields: %v", act)
		}
	}

	mtc := &multiTestContext{}
	defer mtc.Stop()
	mtc.Start(t, 1)

	ctx := context.Background()

	span := roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("b")}

	txn := roachpb.MakeTransaction("foo", span.Key, roachpb.MinUserPriority, hlc.Timestamp{WallTime: 123}, 999)

	const resolveCommitCount = int64(200)
	const resolveAbortCount = int64(800)
	const resolvePoisonCount = int64(2400)

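	// Address the batch to the replica that contains span.Key so that the
	// resolve requests built below are routed to the right range.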
	var ba roachpb.BatchRequest
	{
		repl := mtc.stores[0].LookupReplica(keys.MustAddr(span.Key))
		var err error
		if ba.Replica, err = repl.GetReplicaDescriptor(); err != nil {
			t.Fatal(err)
		}
		ba.RangeID = repl.RangeID
	}

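	// add queues n intent resolution requests with the given transaction status
	// and poison flag; roughly half are ranged (ResolveIntentRange) and half are
	// point (ResolveIntent) requests.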
	add := func(status roachpb.TransactionStatus, poison bool, n int64) {
		for i := int64(0); i < n; i++ {
			key := span.Key
			endKey := span.EndKey
			if i > n/2 {
				req := &roachpb.ResolveIntentRangeRequest{
					IntentTxn: txn.TxnMeta,
					Status:    status,
					Poison:    poison,
				}
				req.Key, req.EndKey = key, endKey
				ba.Add(req)
				continue
			}
			req := &roachpb.ResolveIntentRequest{
				IntentTxn: txn.TxnMeta,
				Status:    status,
				Poison:    poison,
			}
			req.Key = key
			ba.Add(req)
		}
	}

	add(roachpb.COMMITTED, false, resolveCommitCount)
	add(roachpb.ABORTED, false, resolveAbortCount)
	add(roachpb.ABORTED, true, resolvePoisonCount)

	if _, pErr := mtc.senders[0].Send(ctx, ba); pErr != nil {
		t.Fatal(pErr)
	}

	if exp, act := resolveCommitCount, mtc.stores[0].Metrics().ResolveCommitCount.Count(); act < exp || act > exp+50 {
		t.Errorf("expected around %d intent commits, saw %d", exp, act)
	}
	if exp, act := resolveAbortCount, mtc.stores[0].Metrics().ResolveAbortCount.Count(); act < exp || act > exp+50 {
		t.Errorf("expected around %d intent aborts, saw %d", exp, act)
	}
	if exp, act := resolvePoisonCount, mtc.stores[0].Metrics().ResolvePoisonCount.Count(); act < exp || act > exp+50 {
		t.Errorf("expected around %d abort span poisonings, saw %d", exp, act)
	}
}

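// TestStoreMetrics verifies that store gauge metrics stay consistent with
// recomputed MVCC stats across a split, replication, writes, a failed
// transaction, and removal of a replica, and that basic RocksDB metrics are
// populated.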
func TestStoreMetrics(t *testing.T) {
	defer leaktest.AfterTest(t)()

	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
	storeCfg.TestingKnobs.DisableMergeQueue = true
	mtc := &multiTestContext{
		storeConfig: &storeCfg,
		// This test was written before the multiTestContext started creating many
		// system ranges at startup, and hasn't been updated to take that into
		// account.
		startWithSingleRange: true,
	}
	defer mtc.Stop()
	mtc.Start(t, 3)

	// Flush RocksDB memtables, so that RocksDB begins using block-based tables.
	// This is useful, because most of the stats we track don't apply to
	// memtables.
	if err := mtc.stores[0].Engine().Flush(); err != nil {
		t.Fatal(err)
	}
	if err := mtc.stores[1].Engine().Flush(); err != nil {
		t.Fatal(err)
	}

	// Disable raft log truncation, which confuses this test.
	for _, s := range mtc.stores {
		s.SetRaftLogQueueActive(false)
	}

	// Perform a split, which has special metrics handling.
	splitArgs := adminSplitArgs(roachpb.Key("m"))
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); err != nil {
		t.Fatal(err)
	}

	// Verify range count is as expected.
	checkGauge(t, "store 0", mtc.stores[0].Metrics().ReplicaCount, 2)

	// Verify all stats on store0 after split.
	verifyStats(t, mtc, 0)

	// Replicate the "right" range to the other stores.
	replica := mtc.stores[0].LookupReplica(roachpb.RKey("z"))
	mtc.replicateRange(replica.RangeID, 1, 2)

	// Verify stats on store1 after replication.
	verifyStats(t, mtc, 1)

	// Add some data to the "right" range.
	dataKey := []byte("z")
	if _, err := mtc.dbs[0].Inc(context.Background(), dataKey, 5); err != nil {
		t.Fatal(err)
	}
	mtc.waitForValues(roachpb.Key("z"), []int64{5, 5, 5})

	// Verify all stats on stores after addition.
	verifyStats(t, mtc, 0, 1, 2)

	// Create a transaction statement that fails. Regression test for #4969.
	if err := mtc.dbs[0].Txn(context.Background(), func(ctx context.Context, txn *kv.Txn) error {
		b := txn.NewBatch()
		var expVal roachpb.Value
		expVal.SetInt(6)
		b.CPut(dataKey, 7, &expVal)
		return txn.Run(ctx, b)
	}); err == nil {
		t.Fatal("Expected transaction error, but none received")
	}

	// Verify stats after the failed transaction.
	verifyStats(t, mtc, 0, 1, 2)
	checkGauge(t, "store 0", mtc.stores[0].Metrics().ReplicaCount, 2)

	// Unreplicate range from the first store.
	testutils.SucceedsSoon(t, func() error {
		// This statement can fail if store 0 is not the leaseholder.
		if err := mtc.transferLeaseNonFatal(context.Background(), replica.RangeID, 0, 1); err != nil {
			t.Log(err)
		}
		// This statement will fail if store 0 IS the leaseholder. This can happen
		// even after the previous statement.
		return mtc.unreplicateRangeNonFatal(replica.RangeID, 0)
	})

	// Wait until we're sure that store 0 has successfully processed its removal.
	require.NoError(t, mtc.waitForUnreplicated(replica.RangeID, 0))

	mtc.waitForValues(roachpb.Key("z"), []int64{0, 5, 5})

	// Verify range count is as expected.
	checkGauge(t, "store 0", mtc.stores[0].Metrics().ReplicaCount, 1)
	checkGauge(t, "store 1", mtc.stores[1].Metrics().ReplicaCount, 1)

	// Verify all stats on all stores after range is removed.
	verifyStats(t, mtc, 0, 1, 2)

	verifyRocksDBStats(t, mtc.stores[0])
	verifyRocksDBStats(t, mtc.stores[1])
}

// TestStoreMaxBehindNanosOnlyTracksEpochBasedLeases ensures that the metric
// ClosedTimestampMaxBehindNanos does not follow the start time of
// expiration-based leases. Expiration-based leases don't publish closed
// timestamps.
func TestStoreMaxBehindNanosOnlyTracksEpochBasedLeases(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
		ServerArgs: base.TestServerArgs{
			// Set a long timeout so that no lease or liveness ever times out.
			RaftConfig: base.RaftConfig{RaftElectionTimeoutTicks: 100},
		},
	})
	defer tc.Stopper().Stop(ctx)
	tdb := sqlutils.MakeSQLRunner(tc.ServerConn(0))
	// Choose setting values such that this test doesn't take too long, with the
	// caveat that under extreme stress the subsystem must still remain live.
	const closedTimestampDuration = 15 * time.Millisecond
	const closedTimestampFraction = 1
	tdb.Exec(t, "SET CLUSTER SETTING kv.closed_timestamp.target_duration = $1",
		closedTimestampDuration.String())
	tdb.Exec(t, "SET CLUSTER SETTING kv.closed_timestamp.close_fraction = $1",
		closedTimestampFraction)

	// Get to a point where we know that we have an expiration-based lease whose
	// start time lies some way in the past, and a max closed timestamp value
	// that is more recent.
	_, meta2Repl1 := getFirstStoreReplica(t, tc.Server(0), keys.Meta2Prefix)

	// Transfer the lease for the meta range to ensure that it has a non-zero
	// start time.
	require.NoError(t, tc.TransferRangeLease(*meta2Repl1.Desc(), tc.Target(1)))

	testutils.SucceedsSoon(t, func() error {
		_, metaRepl := getFirstStoreReplica(t, tc.Server(1), keys.Meta2Prefix)
		l, _ := metaRepl.GetLease()
		if l.Start == (hlc.Timestamp{}) {
			return errors.Errorf("don't have a lease for meta2 yet: %v %v", l, meta2Repl1)
		}
		sinceExpBasedLeaseStart := timeutil.Since(timeutil.Unix(0, l.Start.WallTime))
		for i := 0; i < tc.NumServers(); i++ {
			s, _ := getFirstStoreReplica(t, tc.Server(i), keys.Meta1Prefix)
			require.NoError(t, s.ComputeMetrics(ctx, 0))
			maxBehind := time.Duration(s.Metrics().ClosedTimestampMaxBehindNanos.Value())
			// We want to make sure that maxBehind ends up being much smaller than the
			// time elapsed since the start of the expiration-based lease.
			const behindMultiple = 5
			if maxBehind*behindMultiple > sinceExpBasedLeaseStart {
				return errors.Errorf("store %v has a ClosedTimestampMaxBehindNanos"+
					" of %v, which is not much less than the time since the expiration-based lease start, %v",
					s.StoreID(), maxBehind, sinceExpBasedLeaseStart)
			}
		}
		return nil
	})
}