github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/node_test.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package server

import (
	"bytes"
	"context"
	"fmt"
	"reflect"
	"sort"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/clusterversion"
	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/status/statuspb"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
)

func formatKeys(keys []roachpb.Key) string {
	var buf bytes.Buffer
	for i, key := range keys {
		fmt.Fprintf(&buf, "%d: %s\n", i, key)
	}
	return buf.String()
}

// keySlice implements sort.Interface.
type keySlice []roachpb.Key

func (s keySlice) Len() int           { return len(s) }
func (s keySlice) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s keySlice) Less(i, j int) bool { return bytes.Compare(s[i], s[j]) < 0 }

// TestBootstrapCluster verifies the results of bootstrapping a
// cluster. Uses an in memory engine.
func TestBootstrapCluster(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	e := storage.NewDefaultInMem()
	defer e.Close()
	require.NoError(t, kvserver.WriteClusterVersion(ctx, e, clusterversion.TestingClusterVersion))
	if _, err := bootstrapCluster(
		ctx, []storage.Engine{e}, zonepb.DefaultZoneConfigRef(), zonepb.DefaultSystemZoneConfigRef(),
	); err != nil {
		t.Fatal(err)
	}

	// Scan the complete contents of the local database directly from the engine.
	res, err := storage.MVCCScan(ctx, e, keys.LocalMax, roachpb.KeyMax, hlc.MaxTimestamp, storage.MVCCScanOptions{})
	if err != nil {
		t.Fatal(err)
	}
	var foundKeys keySlice
	for _, kv := range res.KVs {
		foundKeys = append(foundKeys, kv.Key)
	}
	var expectedKeys = keySlice{
		testutils.MakeKey(roachpb.Key("\x02"), roachpb.KeyMax),
		testutils.MakeKey(roachpb.Key("\x03"), roachpb.KeyMax),
		roachpb.Key("\x04bootstrap-version"),
		roachpb.Key("\x04node-idgen"),
		roachpb.Key("\x04range-idgen"),
		roachpb.Key("\x04store-idgen"),
	}
	for _, splitKey := range config.StaticSplits() {
		meta2Key := keys.RangeMetaKey(splitKey)
		expectedKeys = append(expectedKeys, meta2Key.AsRawKey())
	}

	// Add the initial keys for sql.
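	// GetInitialValues returns the raw KV entries that seed the system SQL
	// tables along with the table split keys; each split key also implies a
	// meta2 addressing record, appended below just as for the static splits
	// above.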
	kvs, tableSplits := GetBootstrapSchema(
		zonepb.DefaultZoneConfigRef(), zonepb.DefaultSystemZoneConfigRef(),
	).GetInitialValues()
	for _, kv := range kvs {
		expectedKeys = append(expectedKeys, kv.Key)
	}
	for _, splitKey := range tableSplits {
		meta2Key := keys.RangeMetaKey(splitKey)
		expectedKeys = append(expectedKeys, meta2Key.AsRawKey())
	}

	// Resort the list. The sql values are not sorted.
	sort.Sort(expectedKeys)

	if !reflect.DeepEqual(foundKeys, expectedKeys) {
		t.Errorf("expected keys mismatch (found vs expected):\n%s\n -- vs. -- \n\n%s",
			formatKeys(foundKeys), formatKeys(expectedKeys))
	}

	// TODO(spencer): check values.
}

// TestBootstrapNewStore starts a cluster with two unbootstrapped
// stores and verifies both stores are added and started.
func TestBootstrapNewStore(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()

	path, cleanup := testutils.TempDir(t)
	defer cleanup()

	// Start server with persisted store so that it gets bootstrapped.
	{
		s, _, _ := serverutils.StartServer(t, base.TestServerArgs{
			StoreSpecs: []base.StoreSpec{
				{Path: path},
			},
		})
		s.Stopper().Stop(ctx)
	}

	specs := []base.StoreSpec{
		{Path: path},
		{InMemory: true},
		{InMemory: true},
	}
	s, _, _ := serverutils.StartServer(t, base.TestServerArgs{
		StoreSpecs: specs,
	})
	defer s.Stopper().Stop(ctx)

	// Check whether all stores are started properly.
	testutils.SucceedsSoon(t, func() error {
		var n int
		err := s.GetStores().(*kvserver.Stores).VisitStores(func(s *kvserver.Store) error {
			if !s.IsStarted() {
				return fmt.Errorf("not started: %s", s)
			}
			n++
			return nil
		})
		if err != nil {
			return err
		}
		if exp := len(specs); exp != n {
			return fmt.Errorf("found only %d of %d stores", n, exp)
		}
		return nil
	})
}

// TestNodeJoin verifies a new node is able to join a bootstrapped
// cluster consisting of one node.
func TestNodeJoin(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()

	// For kicks, start both nodes in the cluster with two initially empty
	// engines. The first node is expected to bootstrap itself, so the second
	// one will join the first.
	perNode := map[int]base.TestServerArgs{}
	perNode[0] = base.TestServerArgs{
		StoreSpecs: []base.StoreSpec{
			{InMemory: true},
			{InMemory: true},
		},
	}
	perNode[1] = perNode[0]

	args := base.TestClusterArgs{
		ReplicationMode:   base.ReplicationManual, // saves time in this test
		ServerArgsPerNode: perNode,
	}

	numNodes := len(perNode)

	s := serverutils.StartTestCluster(t, numNodes, args)
	defer s.Stopper().Stop(ctx)

	// Verify all stores are initialized.
	for i := 0; i < numNodes; i++ {
		testutils.SucceedsSoon(t, func() error {
			exp := len(perNode[i].StoreSpecs)
			sc := s.Server(i).GetStores().(*kvserver.Stores).GetStoreCount()
			if sc != exp {
				return errors.Errorf("%d: saw only %d out of %d stores", i, sc, exp)
			}
			return nil
		})
	}

	// Verify node1 sees node2 via gossip and vice versa.
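	// Each node publishes its NodeDescriptor in gossip under a per-node key;
	// the retry loop below waits until each server can read the other node's
	// descriptor from its own gossip and the advertised RPC address matches.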
	node1Key := gossip.MakeNodeIDKey(s.Server(0).NodeID())
	node2Key := gossip.MakeNodeIDKey(s.Server(1).NodeID())
	server1Addr := s.Server(0).ServingRPCAddr()
	server2Addr := s.Server(1).ServingRPCAddr()
	testutils.SucceedsSoon(t, func() error {
		var nodeDesc1 roachpb.NodeDescriptor
		if err := s.Server(0).GossipI().(*gossip.Gossip).GetInfoProto(node2Key, &nodeDesc1); err != nil {
			return err
		}
		if addr2Str, server2AddrStr := nodeDesc1.Address.String(), server2Addr; addr2Str != server2AddrStr {
			return errors.Errorf("addr2 gossip %s doesn't match addr2 address %s", addr2Str, server2AddrStr)
		}
		var nodeDesc2 roachpb.NodeDescriptor
		if err := s.Server(1).GossipI().(*gossip.Gossip).GetInfoProto(node1Key, &nodeDesc2); err != nil {
			return err
		}
		if addr1Str, server1AddrStr := nodeDesc2.Address.String(), server1Addr; addr1Str != server1AddrStr {
			return errors.Errorf("addr1 gossip %s doesn't match addr1 address %s", addr1Str, server1AddrStr)
		}
		return nil
	})
}

// TestCorruptedClusterID verifies that a node fails to start when a
// store's cluster ID is empty.
func TestCorruptedClusterID(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	e := storage.NewDefaultInMem()
	defer e.Close()

	cv := clusterversion.TestingClusterVersion

	require.NoError(t, kvserver.WriteClusterVersion(ctx, e, cv))
	if _, err := bootstrapCluster(
		ctx, []storage.Engine{e}, zonepb.DefaultZoneConfigRef(), zonepb.DefaultSystemZoneConfigRef(),
	); err != nil {
		t.Fatal(err)
	}

	// Set the cluster ID to the empty UUID.
	sIdent := roachpb.StoreIdent{
		ClusterID: uuid.UUID{},
		NodeID:    1,
		StoreID:   1,
	}
	if err := storage.MVCCPutProto(
		ctx, e, nil /* ms */, keys.StoreIdentKey(), hlc.Timestamp{}, nil /* txn */, &sIdent,
	); err != nil {
		t.Fatal(err)
	}

	_, err := inspectEngines(ctx, []storage.Engine{e}, cv.Version, cv.Version)
	if !testutils.IsError(err, `partially initialized`) {
		t.Fatal(err)
	}
}

// compareNodeStatus ensures that the actual node status for the passed-in
// node is updated correctly. It checks that the Node Descriptor, StoreIDs,
// RangeCount, StartedAt, and ReplicatedRangeCount are exactly correct, that
// the bytes and counts for Live, Key and Val are at least the expected
// values, and that UpdatedAt has increased.
// The latest actual stats are returned.
func compareNodeStatus(
	t *testing.T, ts *TestServer, expectedNodeStatus *statuspb.NodeStatus, testNumber int,
) *statuspb.NodeStatus {
	// ========================================
	// Read NodeStatus from server and validate top-level fields.
	// ========================================
	nodeStatusKey := keys.NodeStatusKey(ts.node.Descriptor.NodeID)
	nodeStatus := &statuspb.NodeStatus{}
	if err := ts.db.GetProto(context.Background(), nodeStatusKey, nodeStatus); err != nil {
		t.Fatalf("%d: failure getting node status: %s", testNumber, err)
	}

	// Descriptor values should be exactly equal to expected.
	if a, e := nodeStatus.Desc, expectedNodeStatus.Desc; !reflect.DeepEqual(a, e) {
		t.Errorf("%d: Descriptor does not match expected.\nexpected: %s\nactual: %s", testNumber, &e, &a)
	}

	// ========================================
	// Ensure all expected stores are represented in the node status.
	// ========================================
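	// storesToMap keys each StoreStatus by StoreID so the comparison below
	// does not depend on the ordering of the StoreStatuses slice.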
	storesToMap := func(ns *statuspb.NodeStatus) map[roachpb.StoreID]statuspb.StoreStatus {
		strMap := make(map[roachpb.StoreID]statuspb.StoreStatus, len(ns.StoreStatuses))
		for _, str := range ns.StoreStatuses {
			strMap[str.Desc.StoreID] = str
		}
		return strMap
	}
	actualStores := storesToMap(nodeStatus)
	expectedStores := storesToMap(expectedNodeStatus)

	if a, e := len(actualStores), len(expectedStores); a != e {
		t.Errorf("%d: actual status contained %d stores, expected %d", testNumber, a, e)
	}
	for key := range expectedStores {
		if _, ok := actualStores[key]; !ok {
			t.Errorf("%d: actual node status did not contain expected store %d", testNumber, key)
		}
	}
	if t.Failed() {
		t.FailNow()
	}

	// ========================================
	// Ensure all metric sets (node and store level) are consistent with
	// expected status.
	// ========================================

	// compareMetricMaps accepts actual and expected metric maps, along with
	// two lists of string keys. For metrics with keys in the 'equal' list,
	// the actual value must be equal to the expected value. For keys in the
	// 'greater' list, the actual value must be greater than or equal to the
	// expected value.
	compareMetricMaps := func(actual, expected map[string]float64, equal, greater []string) {
		// Make sure the actual value map contains all values in expected map.
		for key := range expected {
			if _, ok := actual[key]; !ok {
				t.Errorf("%d: actual node status did not contain expected metric %s", testNumber, key)
			}
		}
		if t.Failed() {
			return
		}

		// For each 'equal' key, ensure that the actual value is equal to the
		// expected value.
		for _, key := range equal {
			if _, ok := actual[key]; !ok {
				t.Errorf("%d: actual node status did not contain expected 'equal' metric key %s", testNumber, key)
				continue
			}
			if a, e := actual[key], expected[key]; a != e {
				t.Errorf("%d: %s does not match expected value.\nExpected %f, Actual %f", testNumber, key, e, a)
			}
		}
		for _, key := range greater {
			if _, ok := actual[key]; !ok {
				t.Errorf("%d: actual node status did not contain expected 'greater' metric key %s", testNumber, key)
				continue
			}
			if a, e := actual[key], expected[key]; a < e {
				t.Errorf("%d: %s is not greater than or equal to expected value.\nExpected %f, Actual %f", testNumber, key, e, a)
			}
		}
	}

	compareMetricMaps(nodeStatus.Metrics, expectedNodeStatus.Metrics, nil, []string{
		"exec.success",
		"exec.error",
	})

	for key := range actualStores {
		// Directly verify a subset of metrics which have predictable output.
		compareMetricMaps(actualStores[key].Metrics, expectedStores[key].Metrics,
			[]string{
				"replicas",
				"replicas.leaseholders",
			},
			[]string{
				"livecount",
				"keycount",
				"valcount",
			})
	}

	if t.Failed() {
		t.FailNow()
	}

	return nodeStatus
}

// TestNodeStatusWritten verifies that status summaries are written correctly for
// both the Node and stores within the node.
func TestNodeStatusWritten(t *testing.T) {
	defer leaktest.AfterTest(t)()

	// ========================================
	// Start test server and wait for full initialization.
	// ========================================
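	// The test proceeds in three phases: verify the initial status summary,
	// write a few keys and verify the metric deltas, then split a range and
	// verify again. compareNodeStatus performs the comparison after each
	// forced status write.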
	srv, _, kvDB := serverutils.StartServer(t, base.TestServerArgs{
		DisableEventLog: true,
	})
	defer srv.Stopper().Stop(context.Background())
	ts := srv.(*TestServer)
	ctx := context.Background()

	// Retrieve the first store from the Node.
	s, err := ts.node.stores.GetStore(roachpb.StoreID(1))
	if err != nil {
		t.Fatal(err)
	}

	s.WaitForInit()

	content := "junk"
	leftKey := "a"

	// Scan over all keys to "wake up" all replicas (force a lease holder election).
	if _, err := kvDB.Scan(context.Background(), keys.MetaMax, keys.MaxKey, 0); err != nil {
		t.Fatal(err)
	}

	// Wait for full replication of initial ranges.
	initialRanges, err := ExpectedInitialRangeCount(kvDB, &ts.cfg.DefaultZoneConfig, &ts.cfg.DefaultSystemZoneConfig)
	if err != nil {
		t.Fatal(err)
	}
	testutils.SucceedsSoon(t, func() error {
		for i := 1; i <= initialRanges; i++ {
			if s.RaftStatus(roachpb.RangeID(i)) == nil {
				return errors.Errorf("Store %d replica %d is not present in raft", s.StoreID(), i)
			}
		}
		return nil
	})

	// ========================================
	// Construct an initial expectation for NodeStatus to compare to the first
	// status produced by the server.
	// ========================================
	expectedNodeStatus := &statuspb.NodeStatus{
		Desc:      ts.node.Descriptor,
		StartedAt: 0,
		UpdatedAt: 0,
		Metrics: map[string]float64{
			"exec.success": 0,
			"exec.error":   0,
		},
	}

	expectedStoreStatuses := make(map[roachpb.StoreID]statuspb.StoreStatus)
	if err := ts.node.stores.VisitStores(func(s *kvserver.Store) error {
		desc, err := s.Descriptor(false /* useCached */)
		if err != nil {
			t.Fatal(err)
		}
		expectedReplicas := 0
		if s.StoreID() == roachpb.StoreID(1) {
			expectedReplicas = initialRanges
		}
		stat := statuspb.StoreStatus{
			Desc: *desc,
			Metrics: map[string]float64{
				"replicas":              float64(expectedReplicas),
				"replicas.leaseholders": float64(expectedReplicas),
				"livebytes":             0,
				"keybytes":              0,
				"valbytes":              0,
				"livecount":             0,
				"keycount":              0,
				"valcount":              0,
			},
		}
		expectedNodeStatus.StoreStatuses = append(expectedNodeStatus.StoreStatuses, stat)
		expectedStoreStatuses[s.StoreID()] = stat
		return nil
	}); err != nil {
		t.Fatal(err)
	}

	// Function to force summaries to be written synchronously, including all
	// data currently in the event pipeline. Only one of the stores has
	// replicas, so there are no concerns related to quorum writes; if there
	// were multiple replicas, more care would need to be taken in the initial
	// syncFeed().
	forceWriteStatus := func() {
		if err := ts.node.computePeriodicMetrics(ctx, 0); err != nil {
			t.Fatalf("error publishing store statuses: %s", err)
		}

		if err := ts.WriteSummaries(); err != nil {
			t.Fatalf("error writing summaries: %s", err)
		}
	}

	// Verify initial status.
	forceWriteStatus()
	expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 1)
	for _, s := range expectedNodeStatus.StoreStatuses {
		expectedStoreStatuses[s.Desc.StoreID] = s
	}

	// ========================================
	// Put some data into the K/V store and confirm change to status.
	// ========================================
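	// Each Put below adds a single live key/value to store 1, so the
	// expectations bump the live/key/val counts and byte totals by one and
	// exec.success by two (one per Put).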
	splitKey := "b"
	rightKey := "c"

	// Write some values left and right of the proposed split key.
	if err := ts.db.Put(ctx, leftKey, content); err != nil {
		t.Fatal(err)
	}
	if err := ts.db.Put(ctx, rightKey, content); err != nil {
		t.Fatal(err)
	}

	// Increment metrics on the node
	expectedNodeStatus.Metrics["exec.success"] += 2

	// Increment metrics on the first store.
	store1 := expectedStoreStatuses[roachpb.StoreID(1)].Metrics
	store1["livecount"]++
	store1["keycount"]++
	store1["valcount"]++
	store1["livebytes"]++
	store1["keybytes"]++
	store1["valbytes"]++

	forceWriteStatus()
	expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 2)
	for _, s := range expectedNodeStatus.StoreStatuses {
		expectedStoreStatuses[s.Desc.StoreID] = s
	}

	// ========================================
	// Perform an admin split and verify that status is updated.
	// ========================================

	// Split the range.
	if err := ts.db.AdminSplit(context.Background(), splitKey, splitKey, hlc.MaxTimestamp /* expirationTime */); err != nil {
		t.Fatal(err)
	}

	// Write on both sides of the split to ensure that the raft machinery
	// is running.
	if err := ts.db.Put(ctx, leftKey, content); err != nil {
		t.Fatal(err)
	}
	if err := ts.db.Put(ctx, rightKey, content); err != nil {
		t.Fatal(err)
	}

	// Increment metrics on the node
	expectedNodeStatus.Metrics["exec.success"] += 2

	// Increment metrics on the first store.
	store1 = expectedStoreStatuses[roachpb.StoreID(1)].Metrics
	store1["replicas"]++
	store1["replicas.leaders"]++
	store1["replicas.leaseholders"]++
	store1["ranges"]++

	forceWriteStatus()
	expectedNodeStatus = compareNodeStatus(t, ts, expectedNodeStatus, 3)
	for _, s := range expectedNodeStatus.StoreStatuses {
		expectedStoreStatuses[s.Desc.StoreID] = s
	}
}

// TestStartNodeWithLocality creates a new node and store and starts them with a
// collection of different localities.
func TestStartNodeWithLocality(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()

	testLocalityWithNewNode := func(locality roachpb.Locality) {
		args := base.TestServerArgs{
			Locality: locality,
		}
		s, _, _ := serverutils.StartServer(t, args)
		defer s.Stopper().Stop(ctx)

		// Check that the locality is present both on the Node and was also
		// handed to each StoreDescriptor.

		desc := s.Node().(*Node).Descriptor
		if !reflect.DeepEqual(desc.Locality, locality) {
			t.Fatalf("expected node locality to be %s, but it was %s", locality, desc.Locality)
		}

		if err := s.GetStores().(*kvserver.Stores).VisitStores(func(store *kvserver.Store) error {
			desc, err := store.Descriptor(false /* useCached */)
			if err != nil {
				t.Fatal(err)
			}
			if !reflect.DeepEqual(desc.Node.Locality, locality) {
				t.Fatalf("expected store's node locality to be %s, but it was %s", locality, desc.Node.Locality)
			}
			return nil
		}); err != nil {
			t.Fatal(err)
		}
	}

	testCases := []roachpb.Locality{
		{},
		{
			Tiers: []roachpb.Tier{
				{Key: "a", Value: "b"},
			},
		},
		{
			Tiers: []roachpb.Tier{
				{Key: "a", Value: "b"},
				{Key: "c", Value: "d"},
				{Key: "e", Value: "f"},
			},
		},
	}

	for _, testCase := range testCases {
		testLocalityWithNewNode(testCase)
	}
}

// TestNodeSendUnknownBatchRequest verifies that a batch containing an empty
// (unset) request union is rejected with an UnsupportedRequestError rather
// than being executed.
func TestNodeSendUnknownBatchRequest(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ba := roachpb.BatchRequest{
		Requests: make([]roachpb.RequestUnion, 1),
	}
	n := &Node{}
	br, err := n.batchInternal(context.Background(), &ba)
	if err != nil {
		t.Fatal(err)
	}
	if br.Error == nil {
		t.Fatal("no batch error returned")
	}
	if _, ok := br.Error.GetDetail().(*roachpb.UnsupportedRequestError); !ok {
		t.Fatalf("expected unsupported request, not %v", br.Error)
	}
}