github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_pool_test.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"math/rand"
	"reflect"
	"sort"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/testutils/gossiputil"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/kr/pretty"
)

var uniqueStore = []*roachpb.StoreDescriptor{
	{
		StoreID: 2,
		Attrs:   roachpb.Attributes{Attrs: []string{"ssd"}},
		Node: roachpb.NodeDescriptor{
			NodeID: 2,
			Attrs:  roachpb.Attributes{Attrs: []string{"a"}},
		},
		Capacity: roachpb.StoreCapacity{
			Capacity:  100,
			Available: 200,
		},
	},
}

type mockNodeLiveness struct {
	syncutil.Mutex
	defaultNodeStatus kvserverpb.NodeLivenessStatus
	nodes             map[roachpb.NodeID]kvserverpb.NodeLivenessStatus
}

func newMockNodeLiveness(defaultNodeStatus kvserverpb.NodeLivenessStatus) *mockNodeLiveness {
	return &mockNodeLiveness{
		defaultNodeStatus: defaultNodeStatus,
		nodes:             map[roachpb.NodeID]kvserverpb.NodeLivenessStatus{},
	}
}

func (m *mockNodeLiveness) setNodeStatus(
	nodeID roachpb.NodeID, status kvserverpb.NodeLivenessStatus,
) {
	m.Lock()
	defer m.Unlock()
	m.nodes[nodeID] = status
}

func (m *mockNodeLiveness) nodeLivenessFunc(
	nodeID roachpb.NodeID, now time.Time, threshold time.Duration,
) kvserverpb.NodeLivenessStatus {
	m.Lock()
	defer m.Unlock()
	if status, ok := m.nodes[nodeID]; ok {
		return status
	}
	return m.defaultNodeStatus
}

// createTestStorePool creates a stopper, gossip and storePool for use in
// tests. Stopper must be stopped by the caller.
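// The returned ManualClock and mockNodeLiveness let tests advance time and
// set per-node liveness directly.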
func createTestStorePool(
	timeUntilStoreDeadValue time.Duration,
	deterministic bool,
	nodeCount NodeCountFunc,
	defaultNodeStatus kvserverpb.NodeLivenessStatus,
) (*stop.Stopper, *gossip.Gossip, *hlc.ManualClock, *StorePool, *mockNodeLiveness) {
	stopper := stop.NewStopper()
	mc := hlc.NewManualClock(123)
	clock := hlc.NewClock(mc.UnixNano, time.Nanosecond)
	st := cluster.MakeTestingClusterSettings()
	rpcContext := rpc.NewContext(
		log.AmbientContext{Tracer: st.Tracer}, &base.Config{Insecure: true}, clock, stopper, st)
	server := rpc.NewServer(rpcContext) // never started
	g := gossip.NewTest(1, rpcContext, server, stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
	mnl := newMockNodeLiveness(defaultNodeStatus)

	TimeUntilStoreDead.Override(&st.SV, timeUntilStoreDeadValue)
	storePool := NewStorePool(
		log.AmbientContext{Tracer: st.Tracer},
		st,
		g,
		clock,
		nodeCount,
		mnl.nodeLivenessFunc,
		deterministic,
	)
	return stopper, g, mc, storePool, mnl
}

// TestStorePoolGossipUpdate ensures that the gossip callback in StorePool
// correctly updates a store's details.
func TestStorePoolGossipUpdate(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper, g, _, sp, _ := createTestStorePool(
		TestTimeUntilStoreDead, false, /* deterministic */
		func() int { return 0 }, /* nodeCount */
		kvserverpb.NodeLivenessStatus_DEAD)
	defer stopper.Stop(context.Background())
	sg := gossiputil.NewStoreGossiper(g)

	sp.detailsMu.RLock()
	if _, ok := sp.detailsMu.storeDetails[2]; ok {
		t.Fatalf("store 2 is already in the pool's store list")
	}
	sp.detailsMu.RUnlock()

	sg.GossipStores(uniqueStore, t)

	sp.detailsMu.RLock()
	if _, ok := sp.detailsMu.storeDetails[2]; !ok {
		t.Fatalf("store 2 isn't in the pool's store list")
	}
	sp.detailsMu.RUnlock()
}

// verifyStoreList ensures that the returned list of stores is correct.
func verifyStoreList(
	sp *StorePool,
	constraints []zonepb.ConstraintsConjunction,
	storeIDs roachpb.StoreIDSlice, // optional
	filter storeFilter,
	expected []int,
	expectedAliveStoreCount int,
	expectedThrottledStoreCount int,
) error {
	var sl StoreList
	var aliveStoreCount int
	var throttled throttledStoreReasons
	if storeIDs == nil {
		sl, aliveStoreCount, throttled = sp.getStoreList(filter)
	} else {
		sl, aliveStoreCount, throttled = sp.getStoreListFromIDs(storeIDs, filter)
	}
	throttledStoreCount := len(throttled)
	sl = sl.filter(constraints)
	if aliveStoreCount != expectedAliveStoreCount {
		return errors.Errorf("expected AliveStoreCount %d does not match actual %d",
			expectedAliveStoreCount, aliveStoreCount)
	}
	if throttledStoreCount != expectedThrottledStoreCount {
		return errors.Errorf("expected ThrottledStoreCount %d does not match actual %d",
			expectedThrottledStoreCount, throttledStoreCount)
	}
	var actual []int
	for _, store := range sl.stores {
		actual = append(actual, int(store.StoreID))
	}
	sort.Ints(expected)
	sort.Ints(actual)
	if !reflect.DeepEqual(expected, actual) {
		return errors.Errorf("expected %+v stores, actual %+v", expected, actual)
	}
	return nil
}

// TestStorePoolGetStoreList ensures that the store list returns only stores
// that are live and match the attribute criteria.
func TestStorePoolGetStoreList(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// We're going to manually mark stores dead in this test.
	stopper, g, _, sp, mnl := createTestStorePool(
		TestTimeUntilStoreDead, false, /* deterministic */
		func() int { return 10 }, /* nodeCount */
		kvserverpb.NodeLivenessStatus_DEAD)
	defer stopper.Stop(context.Background())
	sg := gossiputil.NewStoreGossiper(g)
	constraints := []zonepb.ConstraintsConjunction{
		{
			Constraints: []zonepb.Constraint{
				{Type: zonepb.Constraint_REQUIRED, Value: "ssd"},
				{Type: zonepb.Constraint_REQUIRED, Value: "dc"},
			},
		},
	}
	required := []string{"ssd", "dc"}
	// Nothing yet.
	sl, _, _ := sp.getStoreList(storeFilterNone)
	sl = sl.filter(constraints)
	if len(sl.stores) != 0 {
		t.Errorf("expected no stores, instead %+v", sl.stores)
	}

	matchingStore := roachpb.StoreDescriptor{
		StoreID: 1,
		Node:    roachpb.NodeDescriptor{NodeID: 1},
		Attrs:   roachpb.Attributes{Attrs: required},
	}
	supersetStore := roachpb.StoreDescriptor{
		StoreID: 2,
		Node:    roachpb.NodeDescriptor{NodeID: 2},
		Attrs:   roachpb.Attributes{Attrs: append(required, "db")},
	}
	unmatchingStore := roachpb.StoreDescriptor{
		StoreID: 3,
		Node:    roachpb.NodeDescriptor{NodeID: 3},
		Attrs:   roachpb.Attributes{Attrs: []string{"ssd", "otherdc"}},
	}
	emptyStore := roachpb.StoreDescriptor{
		StoreID: 4,
		Node:    roachpb.NodeDescriptor{NodeID: 4},
		Attrs:   roachpb.Attributes{},
	}
	deadStore := roachpb.StoreDescriptor{
		StoreID: 5,
		Node:    roachpb.NodeDescriptor{NodeID: 5},
		Attrs:   roachpb.Attributes{Attrs: required},
	}
	declinedStore := roachpb.StoreDescriptor{
		StoreID: 6,
		Node:    roachpb.NodeDescriptor{NodeID: 6},
		Attrs:   roachpb.Attributes{Attrs: required},
	}
	absentStore := roachpb.StoreDescriptor{
		StoreID: 7,
		Node:    roachpb.NodeDescriptor{NodeID: 7},
		Attrs:   roachpb.Attributes{Attrs: required},
	}

	// Gossip and mark all alive initially.
	sg.GossipStores([]*roachpb.StoreDescriptor{
		&matchingStore,
		&supersetStore,
		&unmatchingStore,
		&emptyStore,
		&deadStore,
		&declinedStore,
		// absentStore is purposefully not gossiped.
	}, t)
	for i := 1; i <= 7; i++ {
		mnl.setNodeStatus(roachpb.NodeID(i), kvserverpb.NodeLivenessStatus_LIVE)
	}

	// Set deadStore as dead.
	mnl.setNodeStatus(deadStore.Node.NodeID, kvserverpb.NodeLivenessStatus_DEAD)
	sp.detailsMu.Lock()
	// Set declinedStore as throttled.
	sp.detailsMu.storeDetails[declinedStore.StoreID].throttledUntil = sp.clock.Now().GoTime().Add(time.Hour)
	sp.detailsMu.Unlock()

	// No filter or limited set of store IDs.
	if err := verifyStoreList(
		sp,
		constraints,
		nil, /* storeIDs */
		storeFilterNone,
		[]int{
			int(matchingStore.StoreID),
			int(supersetStore.StoreID),
			int(declinedStore.StoreID),
		},
		/* expectedAliveStoreCount */ 5,
		/* expectedThrottledStoreCount */ 1,
	); err != nil {
		t.Error(err)
	}

	// Filter out throttled stores but don't limit the set of store IDs.
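	// The throttled declinedStore drops out of the returned list, but it
	// still counts toward the alive and throttled totals.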
	if err := verifyStoreList(
		sp,
		constraints,
		nil, /* storeIDs */
		storeFilterThrottled,
		[]int{
			int(matchingStore.StoreID),
			int(supersetStore.StoreID),
		},
		/* expectedAliveStoreCount */ 5,
		/* expectedThrottledStoreCount */ 1,
	); err != nil {
		t.Error(err)
	}

	limitToStoreIDs := roachpb.StoreIDSlice{
		matchingStore.StoreID,
		declinedStore.StoreID,
		absentStore.StoreID,
	}

	// No filter but limited to limitToStoreIDs.
	// Note that supersetStore is not included.
	if err := verifyStoreList(
		sp,
		constraints,
		limitToStoreIDs,
		storeFilterNone,
		[]int{
			int(matchingStore.StoreID),
			int(declinedStore.StoreID),
		},
		/* expectedAliveStoreCount */ 2,
		/* expectedThrottledStoreCount */ 1,
	); err != nil {
		t.Error(err)
	}

	// Filter out throttled stores and limit to limitToStoreIDs.
	// Note that supersetStore is not included.
	if err := verifyStoreList(
		sp,
		constraints,
		limitToStoreIDs,
		storeFilterThrottled,
		[]int{
			int(matchingStore.StoreID),
		},
		/* expectedAliveStoreCount */ 2,
		/* expectedThrottledStoreCount */ 1,
	); err != nil {
		t.Error(err)
	}
}

// TestStoreListFilter ensures that the store list constraint filtering works
// properly.
func TestStoreListFilter(t *testing.T) {
	defer leaktest.AfterTest(t)()

	constraints := []zonepb.ConstraintsConjunction{
		{
			Constraints: []zonepb.Constraint{
				{Type: zonepb.Constraint_REQUIRED, Key: "region", Value: "us-west"},
				{Type: zonepb.Constraint_REQUIRED, Value: "MustMatch"},
				{Type: zonepb.Constraint_DEPRECATED_POSITIVE, Value: "MatchingOptional"},
				{Type: zonepb.Constraint_PROHIBITED, Value: "MustNotMatch"},
			},
		},
	}

	stores := []struct {
		attributes []string
		locality   []roachpb.Tier
		expected   bool
	}{
		{
			expected: false,
		},
		{
			attributes: []string{"MustMatch"},
			expected:   false,
		},
		{
			locality: []roachpb.Tier{{Key: "region", Value: "us-west"}},
			expected: false,
		},
		{
			attributes: []string{"MustMatch"},
			locality:   []roachpb.Tier{{Key: "region", Value: "us-west"}},
			expected:   true,
		},
		{
			attributes: []string{"a", "MustMatch"},
			locality:   []roachpb.Tier{{Key: "a", Value: "b"}, {Key: "region", Value: "us-west"}},
			expected:   true,
		},
		{
			attributes: []string{"a", "b", "MustMatch", "c"},
			locality:   []roachpb.Tier{{Key: "region", Value: "us-west"}, {Key: "c", Value: "d"}},
			expected:   true,
		},
		{
			attributes: []string{"MustMatch", "MustNotMatch"},
			locality:   []roachpb.Tier{{Key: "region", Value: "us-west"}},
			expected:   false,
		},
		{
			attributes: []string{"MustMatch"},
			locality:   []roachpb.Tier{{Key: "region", Value: "us-west"}, {Key: "MustNotMatch", Value: "b"}},
			expected:   true,
		},
		{
			attributes: []string{"MustMatch"},
			locality:   []roachpb.Tier{{Key: "region", Value: "us-west"}, {Key: "a", Value: "MustNotMatch"}},
			expected:   true,
		},
	}

	var sl StoreList
	var expected []roachpb.StoreDescriptor
	for i, s := range stores {
		storeDesc := roachpb.StoreDescriptor{
			StoreID: roachpb.StoreID(i + 1),
			Node: roachpb.NodeDescriptor{
				Locality: roachpb.Locality{
					Tiers: s.locality,
				},
			},
		}
		// Randomly stick the attributes in either the node or the store to get
		// code coverage of both locations.
		if rand.Intn(2) == 0 {
			storeDesc.Attrs.Attrs = s.attributes
		} else {
			storeDesc.Node.Attrs.Attrs = s.attributes
		}
		sl.stores = append(sl.stores, storeDesc)
		if s.expected {
			expected = append(expected, storeDesc)
		}
	}

	filtered := sl.filter(constraints)
	if !reflect.DeepEqual(expected, filtered.stores) {
		t.Errorf("did not get expected stores %s", pretty.Diff(expected, filtered.stores))
	}
}

func TestStorePoolUpdateLocalStore(t *testing.T) {
	defer leaktest.AfterTest(t)()
	manual := hlc.NewManualClock(123)
	clock := hlc.NewClock(manual.UnixNano, time.Nanosecond)
	// We're going to manually mark stores dead in this test.
	stopper, g, _, sp, _ := createTestStorePool(
		TestTimeUntilStoreDead, false, /* deterministic */
		func() int { return 10 }, /* nodeCount */
		kvserverpb.NodeLivenessStatus_DEAD)
	defer stopper.Stop(context.Background())
	sg := gossiputil.NewStoreGossiper(g)
	stores := []*roachpb.StoreDescriptor{
		{
			StoreID: 1,
			Node:    roachpb.NodeDescriptor{NodeID: 1},
			Capacity: roachpb.StoreCapacity{
				Capacity:         100,
				Available:        50,
				RangeCount:       5,
				LeaseCount:       1,
				LogicalBytes:     30,
				QueriesPerSecond: 100,
				WritesPerSecond:  30,
			},
		},
		{
			StoreID: 2,
			Node:    roachpb.NodeDescriptor{NodeID: 2},
			Capacity: roachpb.StoreCapacity{
				Capacity:         100,
				Available:        55,
				RangeCount:       4,
				LeaseCount:       2,
				LogicalBytes:     25,
				QueriesPerSecond: 50,
				WritesPerSecond:  25,
			},
		},
	}
	sg.GossipStores(stores, t)

	replica := &Replica{RangeID: 1}
	replica.mu.Lock()
	replica.mu.state.Stats = &enginepb.MVCCStats{
		KeyBytes: 2,
		ValBytes: 4,
	}
	replica.mu.Unlock()
	rs := newReplicaStats(clock, nil)
	for _, store := range stores {
		rs.record(store.Node.NodeID)
	}
	manual.Increment(int64(MinStatsDuration + time.Second))
	replica.leaseholderStats = rs
	replica.writeStats = rs

	rangeUsageInfo := rangeUsageInfoForRepl(replica)

	sp.updateLocalStoreAfterRebalance(roachpb.StoreID(1), rangeUsageInfo, roachpb.ADD_REPLICA)
	desc, ok := sp.getStoreDescriptor(roachpb.StoreID(1))
	if !ok {
		t.Fatalf("couldn't find StoreDescriptor for Store ID %d", 1)
	}
	QPS, _ := replica.leaseholderStats.avgQPS()
	WPS, _ := replica.writeStats.avgQPS()
	if expectedRangeCount := int32(6); desc.Capacity.RangeCount != expectedRangeCount {
		t.Errorf("expected RangeCount %d, but got %d", expectedRangeCount, desc.Capacity.RangeCount)
	}
	if expectedBytes := int64(36); desc.Capacity.LogicalBytes != expectedBytes {
		t.Errorf("expected logical bytes %d, but got %d", expectedBytes, desc.Capacity.LogicalBytes)
	}
	if expectedQPS := float64(100); desc.Capacity.QueriesPerSecond != expectedQPS {
		t.Errorf("expected QueriesPerSecond %f, but got %f", expectedQPS, desc.Capacity.QueriesPerSecond)
	}
	if expectedWPS := 30 + WPS; desc.Capacity.WritesPerSecond != expectedWPS {
		t.Errorf("expected WritesPerSecond %f, but got %f", expectedWPS, desc.Capacity.WritesPerSecond)
	}

	sp.updateLocalStoreAfterRebalance(roachpb.StoreID(2), rangeUsageInfo, roachpb.REMOVE_REPLICA)
	desc, ok = sp.getStoreDescriptor(roachpb.StoreID(2))
	if !ok {
		t.Fatalf("couldn't find StoreDescriptor for Store ID %d", 2)
	}
	if expectedRangeCount := int32(3); desc.Capacity.RangeCount != expectedRangeCount {
		t.Errorf("expected RangeCount %d, but got %d", expectedRangeCount,
			desc.Capacity.RangeCount)
	}
	if expectedBytes := int64(19); desc.Capacity.LogicalBytes != expectedBytes {
		t.Errorf("expected logical bytes %d, but got %d", expectedBytes, desc.Capacity.LogicalBytes)
	}
	if expectedQPS := float64(50); desc.Capacity.QueriesPerSecond != expectedQPS {
		t.Errorf("expected QueriesPerSecond %f, but got %f", expectedQPS, desc.Capacity.QueriesPerSecond)
	}
	if expectedWPS := 25 - WPS; desc.Capacity.WritesPerSecond != expectedWPS {
		t.Errorf("expected WritesPerSecond %f, but got %f", expectedWPS, desc.Capacity.WritesPerSecond)
	}

	sp.updateLocalStoresAfterLeaseTransfer(roachpb.StoreID(1), roachpb.StoreID(2), rangeUsageInfo.QueriesPerSecond)
	desc, ok = sp.getStoreDescriptor(roachpb.StoreID(1))
	if !ok {
		t.Fatalf("couldn't find StoreDescriptor for Store ID %d", 1)
	}
	if expectedLeaseCount := int32(0); desc.Capacity.LeaseCount != expectedLeaseCount {
		t.Errorf("expected LeaseCount %d, but got %d", expectedLeaseCount, desc.Capacity.LeaseCount)
	}
	if expectedQPS := 100 - QPS; desc.Capacity.QueriesPerSecond != expectedQPS {
		t.Errorf("expected QueriesPerSecond %f, but got %f", expectedQPS, desc.Capacity.QueriesPerSecond)
	}
	desc, ok = sp.getStoreDescriptor(roachpb.StoreID(2))
	if !ok {
		t.Fatalf("couldn't find StoreDescriptor for Store ID %d", 2)
	}
	if expectedLeaseCount := int32(3); desc.Capacity.LeaseCount != expectedLeaseCount {
		t.Errorf("expected LeaseCount %d, but got %d", expectedLeaseCount, desc.Capacity.LeaseCount)
	}
	if expectedQPS := 50 + QPS; desc.Capacity.QueriesPerSecond != expectedQPS {
		t.Errorf("expected QueriesPerSecond %f, but got %f", expectedQPS, desc.Capacity.QueriesPerSecond)
	}
}

// TestStorePoolUpdateLocalStoreBeforeGossip verifies that an attempt to update
// the local copy of a store before that store has been gossiped is a no-op.
func TestStorePoolUpdateLocalStoreBeforeGossip(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	manual := hlc.NewManualClock(123)
	clock := hlc.NewClock(manual.UnixNano, time.Nanosecond)
	stopper, _, _, sp, _ := createTestStorePool(
		TestTimeUntilStoreDead, false, /* deterministic */
		func() int { return 10 }, /* nodeCount */
		kvserverpb.NodeLivenessStatus_DEAD)
	defer stopper.Stop(ctx)

	// Create store.
	node := roachpb.NodeDescriptor{NodeID: roachpb.NodeID(1)}
	eng := storage.NewDefaultInMem()
	stopper.AddCloser(eng)
	cfg := TestStoreConfig(clock)
	cfg.Transport = NewDummyRaftTransport(cfg.Settings)
	store := NewStore(ctx, cfg, eng, &node)
	// Fake an ident because this test doesn't want to start the store
	// but without an Ident there will be NPEs.
	store.Ident = &roachpb.StoreIdent{
		ClusterID: uuid.Nil,
		StoreID:   1,
		NodeID:    1,
	}

	// Create replica.
	rg := roachpb.RangeDescriptor{
		RangeID:       1,
		StartKey:      roachpb.RKey([]byte("a")),
		EndKey:        roachpb.RKey([]byte("b")),
		NextReplicaID: 1,
	}
	rg.AddReplica(1, 1, roachpb.VOTER_FULL)
	replica, err := newReplica(ctx, &rg, store, 1)
	if err != nil {
		t.Fatalf("make replica error: %+v", err)
	}
	replica.leaseholderStats = newReplicaStats(store.Clock(), nil)

	rangeUsageInfo := rangeUsageInfoForRepl(replica)

	// Update StorePool, which should be a no-op.
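	// Store 1 was never gossiped, so the pool has no descriptor for it; the
	// update below should leave things that way.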
	storeID := roachpb.StoreID(1)
	if _, ok := sp.getStoreDescriptor(storeID); ok {
		t.Fatalf("StoreDescriptor not gossiped, should not be found")
	}
	sp.updateLocalStoreAfterRebalance(storeID, rangeUsageInfo, roachpb.ADD_REPLICA)
	if _, ok := sp.getStoreDescriptor(storeID); ok {
		t.Fatalf("StoreDescriptor still not gossiped, should not be found")
	}
}

func TestStorePoolGetStoreDetails(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper, g, _, sp, _ := createTestStorePool(
		TestTimeUntilStoreDead, false, /* deterministic */
		func() int { return 10 }, /* nodeCount */
		kvserverpb.NodeLivenessStatus_DEAD)
	defer stopper.Stop(context.Background())
	sg := gossiputil.NewStoreGossiper(g)
	sg.GossipStores(uniqueStore, t)

	sp.detailsMu.Lock()
	defer sp.detailsMu.Unlock()
	if detail := sp.getStoreDetailLocked(roachpb.StoreID(1)); detail.desc != nil {
		t.Errorf("unexpected fetched store ID 1: %+v", detail.desc)
	}
	if detail := sp.getStoreDetailLocked(roachpb.StoreID(2)); detail.desc == nil {
		t.Errorf("failed to fetch store ID 2")
	}
}

func TestStorePoolFindDeadReplicas(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper, g, _, sp, mnl := createTestStorePool(
		TestTimeUntilStoreDead, false, /* deterministic */
		func() int { return 10 }, /* nodeCount */
		kvserverpb.NodeLivenessStatus_DEAD)
	defer stopper.Stop(context.Background())
	sg := gossiputil.NewStoreGossiper(g)

	stores := []*roachpb.StoreDescriptor{
		{
			StoreID: 1,
			Node:    roachpb.NodeDescriptor{NodeID: 1},
		},
		{
			StoreID: 2,
			Node:    roachpb.NodeDescriptor{NodeID: 2},
		},
		{
			StoreID: 3,
			Node:    roachpb.NodeDescriptor{NodeID: 3},
		},
		{
			StoreID: 4,
			Node:    roachpb.NodeDescriptor{NodeID: 4},
		},
		{
			StoreID: 5,
			Node:    roachpb.NodeDescriptor{NodeID: 5},
		},
	}

	replicas := []roachpb.ReplicaDescriptor{
		{
			NodeID:    1,
			StoreID:   1,
			ReplicaID: 1,
		},
		{
			NodeID:    2,
			StoreID:   2,
			ReplicaID: 2,
		},
		{
			NodeID:    3,
			StoreID:   3,
			ReplicaID: 4,
		},
		{
			NodeID:    4,
			StoreID:   4,
			ReplicaID: 4,
		},
		{
			NodeID:    5,
			StoreID:   5,
			ReplicaID: 5,
		},
	}

	sg.GossipStores(stores, t)
	for i := 1; i <= 5; i++ {
		mnl.setNodeStatus(roachpb.NodeID(i), kvserverpb.NodeLivenessStatus_LIVE)
	}

	liveReplicas, deadReplicas := sp.liveAndDeadReplicas(replicas)
	if len(liveReplicas) != 5 {
		t.Fatalf("expected five live replicas, found %d (%v)", len(liveReplicas), liveReplicas)
	}
	if len(deadReplicas) > 0 {
		t.Fatalf("expected no dead replicas initially, found %d (%v)", len(deadReplicas), deadReplicas)
	}
	// Mark nodes 4 & 5 as dead.
	mnl.setNodeStatus(4, kvserverpb.NodeLivenessStatus_DEAD)
	mnl.setNodeStatus(5, kvserverpb.NodeLivenessStatus_DEAD)

	liveReplicas, deadReplicas = sp.liveAndDeadReplicas(replicas)
	if a, e := liveReplicas, replicas[:3]; !reflect.DeepEqual(a, e) {
		t.Fatalf("expected live replicas %+v; got %+v", e, a)
	}
	if a, e := deadReplicas, replicas[3:]; !reflect.DeepEqual(a, e) {
		t.Fatalf("expected dead replicas %+v; got %+v", e, a)
	}

	// Mark node 4 as merely unavailable.
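	// An unavailable node is neither live nor dead, so its replica should
	// drop out of liveReplicas without appearing in deadReplicas (the
	// assertions below expect replicas[:3] and replicas[4:]).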
	mnl.setNodeStatus(4, kvserverpb.NodeLivenessStatus_UNAVAILABLE)

	liveReplicas, deadReplicas = sp.liveAndDeadReplicas(replicas)
	if a, e := liveReplicas, replicas[:3]; !reflect.DeepEqual(a, e) {
		t.Fatalf("expected live replicas %+v; got %+v", e, a)
	}
	if a, e := deadReplicas, replicas[4:]; !reflect.DeepEqual(a, e) {
		t.Fatalf("expected dead replicas %+v; got %+v", e, a)
	}
}

// TestStorePoolDefaultState verifies that the default state of a
// store is neither alive nor dead. This is a regression test for a
// bug in which a call to deadReplicas involving an unknown store
// would have the side effect of marking that store as alive and
// eligible for return by getStoreList. It is therefore significant
// that the two methods are tested in the same test, and in this
// order.
func TestStorePoolDefaultState(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper, _, _, sp, _ := createTestStorePool(
		TestTimeUntilStoreDead, false, /* deterministic */
		func() int { return 10 }, /* nodeCount */
		kvserverpb.NodeLivenessStatus_DEAD)
	defer stopper.Stop(context.Background())

	liveReplicas, deadReplicas := sp.liveAndDeadReplicas([]roachpb.ReplicaDescriptor{{StoreID: 1}})
	if len(liveReplicas) != 0 || len(deadReplicas) != 0 {
		t.Errorf("expected 0 live and 0 dead replicas; got %v and %v", liveReplicas, deadReplicas)
	}

	sl, alive, throttled := sp.getStoreList(storeFilterNone)
	if len(sl.stores) > 0 {
		t.Errorf("expected no live stores; got list of %v", sl)
	}
	if alive != 0 {
		t.Errorf("expected no live stores; got a live count of %d", alive)
	}
	if len(throttled) != 0 {
		t.Errorf("expected no live stores; got throttled %v", throttled)
	}
}

func TestStorePoolThrottle(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper, g, _, sp, _ := createTestStorePool(
		TestTimeUntilStoreDead, false, /* deterministic */
		func() int { return 10 }, /* nodeCount */
		kvserverpb.NodeLivenessStatus_DEAD)
	defer stopper.Stop(context.Background())

	sg := gossiputil.NewStoreGossiper(g)
	sg.GossipStores(uniqueStore, t)

	{
		expected := sp.clock.Now().GoTime().Add(DeclinedReservationsTimeout.Get(&sp.st.SV))
		sp.throttle(throttleDeclined, "", 1)

		sp.detailsMu.Lock()
		detail := sp.getStoreDetailLocked(1)
		sp.detailsMu.Unlock()
		if !detail.throttledUntil.Equal(expected) {
			t.Errorf("expected store to have been throttled to %v, found %v",
				expected, detail.throttledUntil)
		}
	}

	{
		expected := sp.clock.Now().GoTime().Add(FailedReservationsTimeout.Get(&sp.st.SV))
		sp.throttle(throttleFailed, "", 1)

		sp.detailsMu.Lock()
		detail := sp.getStoreDetailLocked(1)
		sp.detailsMu.Unlock()
		if !detail.throttledUntil.Equal(expected) {
			t.Errorf("expected store to have been throttled to %v, found %v",
				expected, detail.throttledUntil)
		}
	}
}

func TestGetLocalities(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper, g, _, sp, _ := createTestStorePool(
		TestTimeUntilStoreDead, false, /* deterministic */
		func() int { return 10 }, /* nodeCount */
		kvserverpb.NodeLivenessStatus_DEAD)
	defer stopper.Stop(context.Background())
	sg := gossiputil.NewStoreGossiper(g)

	// Creates a node with a locality with the number of tiers passed in. The
	// NodeID is the same as the tier count.
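	// For example, createLocality(2) returns a locality with tiers 1=1 and 2=2.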
	createLocality := func(tierCount int) roachpb.Locality {
		var locality roachpb.Locality
		for i := 1; i <= tierCount; i++ {
			value := fmt.Sprintf("%d", i)
			locality.Tiers = append(locality.Tiers, roachpb.Tier{
				Key:   value,
				Value: value,
			})
		}
		return locality
	}
	createDescWithLocality := func(tierCount int) roachpb.NodeDescriptor {
		return roachpb.NodeDescriptor{
			NodeID:   roachpb.NodeID(tierCount),
			Locality: createLocality(tierCount),
		}
	}

	stores := []*roachpb.StoreDescriptor{
		{
			StoreID: 1,
			Node:    createDescWithLocality(1),
		},
		{
			StoreID: 2,
			Node:    createDescWithLocality(2),
		},
		{
			StoreID: 3,
			Node:    createDescWithLocality(3),
		},
		{
			StoreID: 4,
			Node:    createDescWithLocality(2),
		},
	}

	sg.GossipStores(stores, t)

	var existingReplicas []roachpb.ReplicaDescriptor
	for _, store := range stores {
		existingReplicas = append(existingReplicas, roachpb.ReplicaDescriptor{NodeID: store.Node.NodeID})
	}

	localities := sp.getLocalities(existingReplicas)
	for _, store := range stores {
		nodeID := store.Node.NodeID
		locality, ok := localities[nodeID]
		if !ok {
			t.Fatalf("could not find locality for node %d", nodeID)
		}
		if e, a := int(nodeID), len(locality.Tiers); e != a {
			t.Fatalf("for node %d, expected %d tiers, only got %d", nodeID, e, a)
		}
		if e, a := createLocality(int(nodeID)).String(), sp.getNodeLocalityString(nodeID); e != a {
			t.Fatalf("for getNodeLocalityString(%d), expected %q, got %q", nodeID, e, a)
		}
	}
}

func TestStorePoolDecommissioningReplicas(t *testing.T) {
	defer leaktest.AfterTest(t)()
	stopper, g, _, sp, mnl := createTestStorePool(
		TestTimeUntilStoreDead, false, /* deterministic */
		func() int { return 10 }, /* nodeCount */
		kvserverpb.NodeLivenessStatus_DEAD)
	defer stopper.Stop(context.Background())
	sg := gossiputil.NewStoreGossiper(g)

	stores := []*roachpb.StoreDescriptor{
		{
			StoreID: 1,
			Node:    roachpb.NodeDescriptor{NodeID: 1},
		},
		{
			StoreID: 2,
			Node:    roachpb.NodeDescriptor{NodeID: 2},
		},
		{
			StoreID: 3,
			Node:    roachpb.NodeDescriptor{NodeID: 3},
		},
		{
			StoreID: 4,
			Node:    roachpb.NodeDescriptor{NodeID: 4},
		},
		{
			StoreID: 5,
			Node:    roachpb.NodeDescriptor{NodeID: 5},
		},
	}

	replicas := []roachpb.ReplicaDescriptor{
		{
			NodeID:    1,
			StoreID:   1,
			ReplicaID: 1,
		},
		{
			NodeID:    2,
			StoreID:   2,
			ReplicaID: 2,
		},
		{
			NodeID:    3,
			StoreID:   3,
			ReplicaID: 4,
		},
		{
			NodeID:    4,
			StoreID:   4,
			ReplicaID: 4,
		},
		{
			NodeID:    5,
			StoreID:   5,
			ReplicaID: 5,
		},
	}

	sg.GossipStores(stores, t)
	for i := 1; i <= 5; i++ {
		mnl.setNodeStatus(roachpb.NodeID(i), kvserverpb.NodeLivenessStatus_LIVE)
	}

	liveReplicas, deadReplicas := sp.liveAndDeadReplicas(replicas)
	if len(liveReplicas) != 5 {
		t.Fatalf("expected five live replicas, found %d (%v)", len(liveReplicas), liveReplicas)
	}
	if len(deadReplicas) > 0 {
		t.Fatalf("expected no dead replicas initially, found %d (%v)", len(deadReplicas), deadReplicas)
	}
	// Mark node 4 as decommissioning.
	mnl.setNodeStatus(4, kvserverpb.NodeLivenessStatus_DECOMMISSIONING)
	// Mark node 5 as dead.
	mnl.setNodeStatus(5, kvserverpb.NodeLivenessStatus_DEAD)

	liveReplicas, deadReplicas = sp.liveAndDeadReplicas(replicas)
	// Decommissioning replicas are considered live.
	if a, e := liveReplicas, replicas[:4]; !reflect.DeepEqual(a, e) {
		t.Fatalf("expected live replicas %+v; got %+v", e, a)
	}
	if a, e := deadReplicas, replicas[4:]; !reflect.DeepEqual(a, e) {
		t.Fatalf("expected dead replicas %+v; got %+v", e, a)
	}

	decommissioningReplicas := sp.decommissioningReplicas(replicas)
	if a, e := decommissioningReplicas, replicas[3:4]; !reflect.DeepEqual(a, e) {
		t.Fatalf("expected decommissioning replicas %+v; got %+v", e, a)
	}
}

func TestNodeLivenessLivenessStatus(t *testing.T) {
	defer leaktest.AfterTest(t)()
	now := timeutil.Now()
	threshold := 5 * time.Minute

	for _, tc := range []struct {
		liveness kvserverpb.Liveness
		expected kvserverpb.NodeLivenessStatus
	}{
		// Valid status.
		{
			liveness: kvserverpb.Liveness{
				NodeID: 1,
				Epoch:  1,
				Expiration: hlc.LegacyTimestamp{
					WallTime: now.Add(5 * time.Minute).UnixNano(),
				},
				Decommissioning: false,
				Draining:        false,
			},
			expected: kvserverpb.NodeLivenessStatus_LIVE,
		},
		{
			liveness: kvserverpb.Liveness{
				NodeID: 1,
				Epoch:  1,
				Expiration: hlc.LegacyTimestamp{
					// Expires just slightly in the future.
					WallTime: now.UnixNano() + 1,
				},
				Decommissioning: false,
				Draining:        false,
			},
			expected: kvserverpb.NodeLivenessStatus_LIVE,
		},
		// Expired status.
		{
			liveness: kvserverpb.Liveness{
				NodeID: 1,
				Epoch:  1,
				Expiration: hlc.LegacyTimestamp{
					// Just expired.
					WallTime: now.UnixNano(),
				},
				Decommissioning: false,
				Draining:        false,
			},
			expected: kvserverpb.NodeLivenessStatus_UNAVAILABLE,
		},
		// Expired status.
		{
			liveness: kvserverpb.Liveness{
				NodeID: 1,
				Epoch:  1,
				Expiration: hlc.LegacyTimestamp{
					WallTime: now.UnixNano(),
				},
				Decommissioning: false,
				Draining:        false,
			},
			expected: kvserverpb.NodeLivenessStatus_UNAVAILABLE,
		},
		// Max bound of expired.
		{
			liveness: kvserverpb.Liveness{
				NodeID: 1,
				Epoch:  1,
				Expiration: hlc.LegacyTimestamp{
					WallTime: now.Add(-threshold).UnixNano() + 1,
				},
				Decommissioning: false,
				Draining:        false,
			},
			expected: kvserverpb.NodeLivenessStatus_UNAVAILABLE,
		},
		// Dead status.
		{
			liveness: kvserverpb.Liveness{
				NodeID: 1,
				Epoch:  1,
				Expiration: hlc.LegacyTimestamp{
					WallTime: now.Add(-threshold).UnixNano(),
				},
				Decommissioning: false,
				Draining:        false,
			},
			expected: kvserverpb.NodeLivenessStatus_DEAD,
		},
		// Decommissioning.
		{
			liveness: kvserverpb.Liveness{
				NodeID: 1,
				Epoch:  1,
				Expiration: hlc.LegacyTimestamp{
					WallTime: now.Add(time.Second).UnixNano(),
				},
				Decommissioning: true,
				Draining:        false,
			},
			expected: kvserverpb.NodeLivenessStatus_DECOMMISSIONING,
		},
		// Decommissioned.
		{
			liveness: kvserverpb.Liveness{
				NodeID: 1,
				Epoch:  1,
				Expiration: hlc.LegacyTimestamp{
					WallTime: now.Add(-threshold).UnixNano(),
				},
				Decommissioning: true,
				Draining:        false,
			},
			expected: kvserverpb.NodeLivenessStatus_DECOMMISSIONED,
		},
		// Draining (reports as unavailable).
		{
			liveness: kvserverpb.Liveness{
				NodeID: 1,
				Epoch:  1,
				Expiration: hlc.LegacyTimestamp{
					WallTime: now.Add(5 * time.Minute).UnixNano(),
				},
				Decommissioning: false,
				Draining:        true,
			},
			expected: kvserverpb.NodeLivenessStatus_UNAVAILABLE,
		},
	} {
		t.Run("", func(t *testing.T) {
			if a, e := LivenessStatus(tc.liveness, now, threshold), tc.expected; a != e {
				t.Errorf("liveness status was %s, wanted %s", a.String(), e.String())
			}
		})
	}
}