github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_metrics_test.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
	"context"
	"fmt"
	"sync"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
)

func checkGauge(t *testing.T, id string, g *metric.Gauge, e int64) {
	t.Helper()
	if a := g.Value(); a != e {
		t.Error(errors.Errorf("%s for store %s: gauge %d != computed %d", g.GetName(), id, a, e))
	}
}

func verifyStats(t *testing.T, mtc *multiTestContext, storeIdxSlice ...int) {
	t.Helper()
	var stores []*kvserver.Store
	var wg sync.WaitGroup

	mtc.mu.RLock()
	numStores := len(mtc.stores)
	// We need to stop the stores at the given indexes, while keeping the reference to the
	// store objects. ComputeMVCCStats() still works on a stopped store (it needs
	// only the engine, which is still open), and the most recent stats are still
	// available on the stopped store object; however, no further information can
	// be committed to the store while it is stopped, preventing any races during
	// verification.
	for _, storeIdx := range storeIdxSlice {
		stores = append(stores, mtc.stores[storeIdx])
	}
	mtc.mu.RUnlock()

	// Sanity regression check for bug #4624: ensure intent count is zero.
	// This may not be true immediately due to the asynchronous nature of
	// non-local intent resolution.
	for _, s := range stores {
		m := s.Metrics()
		testutils.SucceedsSoon(t, func() error {
			if a := m.IntentCount.Value(); a != 0 {
				return fmt.Errorf("expected intent count to be zero, was %d", a)
			}
			return nil
		})
	}

	wg.Add(numStores)
	// We actually stop *all* of the Stores. Stopping only a few is riddled
	// with deadlocks since operations can span nodes, but stoppers don't
	// know about this - taking all of them down at the same time is the
	// only sane way of guaranteeing that nothing interesting happens, at
	// least when bringing down the nodes jeopardizes majorities.
	for i := 0; i < numStores; i++ {
		go func(i int) {
			defer wg.Done()
			mtc.stopStore(i)
		}(i)
	}
	wg.Wait()

	for _, s := range stores {
		idString := s.Ident.String()
		m := s.Metrics()

		// Sanity check: LiveBytes is not zero (ensures we don't have
		// zeroed-out structures).
		if liveBytes := m.LiveBytes.Value(); liveBytes == 0 {
			t.Errorf("store %s; got zero live bytes, expected non-zero", idString)
		}

		// Compute real total MVCC statistics from store.
		realStats, err := s.ComputeMVCCStats()
		if err != nil {
			t.Fatal(err)
		}

		// Ensure that real MVCC stats match computed stats.
		checkGauge(t, idString, m.LiveBytes, realStats.LiveBytes)
		checkGauge(t, idString, m.KeyBytes, realStats.KeyBytes)
		checkGauge(t, idString, m.ValBytes, realStats.ValBytes)
		checkGauge(t, idString, m.IntentBytes, realStats.IntentBytes)
		checkGauge(t, idString, m.LiveCount, realStats.LiveCount)
		checkGauge(t, idString, m.KeyCount, realStats.KeyCount)
		checkGauge(t, idString, m.ValCount, realStats.ValCount)
		checkGauge(t, idString, m.IntentCount, realStats.IntentCount)
		checkGauge(t, idString, m.SysBytes, realStats.SysBytes)
		checkGauge(t, idString, m.SysCount, realStats.SysCount)
		// "Ages" will be different depending on how much time has passed. Even with
		// a manual clock, this can be an issue in tests. Therefore, we do not
		// verify them in this test.
	}

	if t.Failed() {
		t.Fatalf("verifyStats failed, aborting test.")
	}

	// Restart all Stores.
	for i := 0; i < numStores; i++ {
		mtc.restartStore(i)
	}
}

func verifyRocksDBStats(t *testing.T, s *kvserver.Store) {
	if err := s.ComputeMetrics(context.Background(), 0); err != nil {
		t.Fatal(err)
	}

	m := s.Metrics()
	testcases := []struct {
		gauge *metric.Gauge
		min   int64
	}{
		{m.RdbBlockCacheHits, 10},
		{m.RdbBlockCacheMisses, 0},
		{m.RdbBlockCacheUsage, 0},
		{m.RdbBlockCachePinnedUsage, 0},
		{m.RdbBloomFilterPrefixChecked, 20},
		{m.RdbBloomFilterPrefixUseful, 20},
		{m.RdbMemtableTotalSize, 5000},
		{m.RdbFlushes, 1},
		{m.RdbCompactions, 0},
		{m.RdbTableReadersMemEstimate, 50},
	}
	for _, tc := range testcases {
		if a := tc.gauge.Value(); a < tc.min {
			t.Errorf("gauge %s = %d < min %d", tc.gauge.GetName(), a, tc.min)
		}
	}
}

// TestStoreResolveMetrics verifies that metrics related to intent resolution
// are tracked properly.
func TestStoreResolveMetrics(t *testing.T) {
	defer leaktest.AfterTest(t)()

	// First prevent rot that would result from adding fields without handling
	// them everywhere.
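	// Formatting the zero value with %+v includes every field name, so any
	// field added to result.Metrics changes the string below and forces this
	// test to be updated along with it.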
	{
		act := fmt.Sprintf("%+v", result.Metrics{})
		exp := "{LeaseRequestSuccess:0 LeaseRequestError:0 LeaseTransferSuccess:0 LeaseTransferError:0 ResolveCommit:0 ResolveAbort:0 ResolvePoison:0}"
		if act != exp {
			t.Errorf("need to update this test due to added fields: %v", act)
		}
	}

	mtc := &multiTestContext{}
	defer mtc.Stop()
	mtc.Start(t, 1)

	ctx := context.Background()

	span := roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("b")}

	txn := roachpb.MakeTransaction("foo", span.Key, roachpb.MinUserPriority, hlc.Timestamp{WallTime: 123}, 999)

	const resolveCommitCount = int64(200)
	const resolveAbortCount = int64(800)
	const resolvePoisonCount = int64(2400)

	var ba roachpb.BatchRequest
	{
		repl := mtc.stores[0].LookupReplica(keys.MustAddr(span.Key))
		var err error
		if ba.Replica, err = repl.GetReplicaDescriptor(); err != nil {
			t.Fatal(err)
		}
		ba.RangeID = repl.RangeID
	}

	add := func(status roachpb.TransactionStatus, poison bool, n int64) {
		for i := int64(0); i < n; i++ {
			key := span.Key
			endKey := span.EndKey
			if i > n/2 {
				req := &roachpb.ResolveIntentRangeRequest{
					IntentTxn: txn.TxnMeta,
					Status:    status,
					Poison:    poison,
				}
				req.Key, req.EndKey = key, endKey
				ba.Add(req)
				continue
			}
			req := &roachpb.ResolveIntentRequest{
				IntentTxn: txn.TxnMeta,
				Status:    status,
				Poison:    poison,
			}
			req.Key = key
			ba.Add(req)
		}
	}

	add(roachpb.COMMITTED, false, resolveCommitCount)
	add(roachpb.ABORTED, false, resolveAbortCount)
	add(roachpb.ABORTED, true, resolvePoisonCount)

	if _, pErr := mtc.senders[0].Send(ctx, ba); pErr != nil {
		t.Fatal(pErr)
	}

	if exp, act := resolveCommitCount, mtc.stores[0].Metrics().ResolveCommitCount.Count(); act < exp || act > exp+50 {
		t.Errorf("expected around %d intent commits, saw %d", exp, act)
	}
	if exp, act := resolveAbortCount, mtc.stores[0].Metrics().ResolveAbortCount.Count(); act < exp || act > exp+50 {
		t.Errorf("expected around %d intent aborts, saw %d", exp, act)
	}
	if exp, act := resolvePoisonCount, mtc.stores[0].Metrics().ResolvePoisonCount.Count(); act < exp || act > exp+50 {
		t.Errorf("expected around %d abort span poisonings, saw %d", exp, act)
	}
}

func TestStoreMetrics(t *testing.T) {
	defer leaktest.AfterTest(t)()

	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
	storeCfg.TestingKnobs.DisableMergeQueue = true
	mtc := &multiTestContext{
		storeConfig: &storeCfg,
		// This test was written before the multiTestContext started creating many
		// system ranges at startup, and hasn't been updated to take that into
		// account.
		startWithSingleRange: true,
	}
	defer mtc.Stop()
	mtc.Start(t, 3)

	// Flush RocksDB memtables, so that RocksDB begins using block-based tables.
	// This is useful, because most of the stats we track don't apply to
	// memtables.
	if err := mtc.stores[0].Engine().Flush(); err != nil {
		t.Fatal(err)
	}
	if err := mtc.stores[1].Engine().Flush(); err != nil {
		t.Fatal(err)
	}

	// Disable the raft log truncation which confuses this test.
	for _, s := range mtc.stores {
		s.SetRaftLogQueueActive(false)
	}

	// Perform a split, which has special metrics handling.
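	// Splitting at "m" carves a new right-hand range out of the single initial
	// range, so store 0 ends up with two replicas; the ReplicaCount gauge below
	// is checked against that.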
	splitArgs := adminSplitArgs(roachpb.Key("m"))
	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); err != nil {
		t.Fatal(err)
	}

	// Verify range count is as expected.
	checkGauge(t, "store 0", mtc.stores[0].Metrics().ReplicaCount, 2)

	// Verify all stats on store0 after split.
	verifyStats(t, mtc, 0)

	// Replicate the "right" range to the other stores.
	replica := mtc.stores[0].LookupReplica(roachpb.RKey("z"))
	mtc.replicateRange(replica.RangeID, 1, 2)

	// Verify stats on store1 after replication.
	verifyStats(t, mtc, 1)

	// Add some data to the "right" range.
	dataKey := []byte("z")
	if _, err := mtc.dbs[0].Inc(context.Background(), dataKey, 5); err != nil {
		t.Fatal(err)
	}
	mtc.waitForValues(roachpb.Key("z"), []int64{5, 5, 5})

	// Verify all stats on all stores after addition.
	verifyStats(t, mtc, 0, 1, 2)

	// Create a transaction statement that fails. Regression test for #4969.
	if err := mtc.dbs[0].Txn(context.Background(), func(ctx context.Context, txn *kv.Txn) error {
		b := txn.NewBatch()
		var expVal roachpb.Value
		expVal.SetInt(6)
		b.CPut(dataKey, 7, &expVal)
		return txn.Run(ctx, b)
	}); err == nil {
		t.Fatal("Expected transaction error, but none received")
	}

	// Verify stats after addition.
	verifyStats(t, mtc, 0, 1, 2)
	checkGauge(t, "store 0", mtc.stores[0].Metrics().ReplicaCount, 2)

	// Unreplicate range from the first store.
	testutils.SucceedsSoon(t, func() error {
		// This statement can fail if store 0 is not the leaseholder.
		if err := mtc.transferLeaseNonFatal(context.Background(), replica.RangeID, 0, 1); err != nil {
			t.Log(err)
		}
		// This statement will fail if store 0 IS the leaseholder. This can happen
		// even after the previous statement.
		return mtc.unreplicateRangeNonFatal(replica.RangeID, 0)
	})

	// Wait until we're sure that store 0 has successfully processed its removal.
	require.NoError(t, mtc.waitForUnreplicated(replica.RangeID, 0))

	mtc.waitForValues(roachpb.Key("z"), []int64{0, 5, 5})

	// Verify range count is as expected.
	checkGauge(t, "store 0", mtc.stores[0].Metrics().ReplicaCount, 1)
	checkGauge(t, "store 1", mtc.stores[1].Metrics().ReplicaCount, 1)

	// Verify all stats on all stores after range is removed.
	verifyStats(t, mtc, 0, 1, 2)

	verifyRocksDBStats(t, mtc.stores[0])
	verifyRocksDBStats(t, mtc.stores[1])
}

// TestStoreMaxBehindNanosOnlyTracksEpochBasedLeases ensures that the metric
// ClosedTimestampMaxBehindNanos does not follow the start time of expiration
// based leases. Expiration based leases don't publish closed timestamps.
func TestStoreMaxBehindNanosOnlyTracksEpochBasedLeases(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
		ServerArgs: base.TestServerArgs{
			// Set a long timeout so that no lease or liveness ever times out.
			RaftConfig: base.RaftConfig{RaftElectionTimeoutTicks: 100},
		},
	})
	defer tc.Stopper().Stop(ctx)
	tdb := sqlutils.MakeSQLRunner(tc.ServerConn(0))
	// We want to choose setting values such that this test doesn't take too long
	// with the caveat that under extreme stress, we need to make sure that the
	// subsystem remains live.
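	// A short target duration keeps the closed timestamp only a few
	// milliseconds behind real time, so ClosedTimestampMaxBehindNanos stays far
	// below the age of the expiration-based lease checked below.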
	const closedTimestampDuration = 15 * time.Millisecond
	const closedTimestampFraction = 1
	tdb.Exec(t, "SET CLUSTER SETTING kv.closed_timestamp.target_duration = $1",
		closedTimestampDuration.String())
	tdb.Exec(t, "SET CLUSTER SETTING kv.closed_timestamp.close_fraction = $1",
		closedTimestampFraction)

	// Get to a point where we know that we have an expiration-based lease whose
	// start time is well in the past, and a max closed timestamp that is more
	// recent.
	_, meta2Repl1 := getFirstStoreReplica(t, tc.Server(0), keys.Meta2Prefix)

	// Transfer the lease for the meta range to ensure that it has a non-zero
	// start time.
	require.NoError(t, tc.TransferRangeLease(*meta2Repl1.Desc(), tc.Target(1)))

	testutils.SucceedsSoon(t, func() error {
		_, metaRepl := getFirstStoreReplica(t, tc.Server(1), keys.Meta2Prefix)
		l, _ := metaRepl.GetLease()
		if l.Start == (hlc.Timestamp{}) {
			return errors.Errorf("don't have a lease for meta2 yet: %v %v", l, meta2Repl1)
		}
		sinceExpBasedLeaseStart := timeutil.Since(timeutil.Unix(0, l.Start.WallTime))
		for i := 0; i < tc.NumServers(); i++ {
			s, _ := getFirstStoreReplica(t, tc.Server(i), keys.Meta1Prefix)
			require.NoError(t, s.ComputeMetrics(ctx, 0))
			maxBehind := time.Duration(s.Metrics().ClosedTimestampMaxBehindNanos.Value())
			// We want to make sure that maxBehind ends up being much smaller than the
			// time since the expiration-based lease started.
			const behindMultiple = 5
			if maxBehind*behindMultiple > sinceExpBasedLeaseStart {
				return errors.Errorf("store %v has a ClosedTimestampMaxBehindNanos"+
					" of %v which is not way less than the time since the expiration-based lease start, %v",
					s.StoreID(), maxBehind, sinceExpBasedLeaseStart)
			}
		}
		return nil
	})
}