github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_rebalancer_test.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "context" 15 "reflect" 16 "sort" 17 "testing" 18 19 "github.com/cockroachdb/cockroach/pkg/roachpb" 20 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 21 "github.com/cockroachdb/cockroach/pkg/testutils/gossiputil" 22 "github.com/cockroachdb/cockroach/pkg/util/hlc" 23 "github.com/cockroachdb/cockroach/pkg/util/leaktest" 24 "github.com/cockroachdb/cockroach/pkg/util/stop" 25 "github.com/gogo/protobuf/proto" 26 "go.etcd.io/etcd/raft" 27 "go.etcd.io/etcd/raft/tracker" 28 ) 29 30 var ( 31 // noLocalityStores specifies a set of stores where one store is 32 // under-utilized in terms of QPS, three are in the middle, and one is 33 // over-utilized. 34 noLocalityStores = []*roachpb.StoreDescriptor{ 35 { 36 StoreID: 1, 37 Node: roachpb.NodeDescriptor{NodeID: 1}, 38 Capacity: roachpb.StoreCapacity{ 39 QueriesPerSecond: 1500, 40 }, 41 }, 42 { 43 StoreID: 2, 44 Node: roachpb.NodeDescriptor{NodeID: 2}, 45 Capacity: roachpb.StoreCapacity{ 46 QueriesPerSecond: 1100, 47 }, 48 }, 49 { 50 StoreID: 3, 51 Node: roachpb.NodeDescriptor{NodeID: 3}, 52 Capacity: roachpb.StoreCapacity{ 53 QueriesPerSecond: 1000, 54 }, 55 }, 56 { 57 StoreID: 4, 58 Node: roachpb.NodeDescriptor{NodeID: 4}, 59 Capacity: roachpb.StoreCapacity{ 60 QueriesPerSecond: 900, 61 }, 62 }, 63 { 64 StoreID: 5, 65 Node: roachpb.NodeDescriptor{NodeID: 5}, 66 Capacity: roachpb.StoreCapacity{ 67 QueriesPerSecond: 500, 68 }, 69 }, 70 } 71 ) 72 73 type testRange struct { 74 // The first storeID in the list will be the leaseholder. 75 storeIDs []roachpb.StoreID 76 qps float64 77 } 78 79 func loadRanges(rr *replicaRankings, s *Store, ranges []testRange) { 80 acc := rr.newAccumulator() 81 for _, r := range ranges { 82 repl := &Replica{store: s} 83 repl.mu.state.Desc = &roachpb.RangeDescriptor{} 84 repl.mu.zone = s.cfg.DefaultZoneConfig 85 for _, storeID := range r.storeIDs { 86 repl.mu.state.Desc.InternalReplicas = append(repl.mu.state.Desc.InternalReplicas, roachpb.ReplicaDescriptor{ 87 NodeID: roachpb.NodeID(storeID), 88 StoreID: storeID, 89 ReplicaID: roachpb.ReplicaID(storeID), 90 }) 91 } 92 repl.mu.state.Lease = &roachpb.Lease{ 93 Expiration: &hlc.MaxTimestamp, 94 Replica: repl.mu.state.Desc.InternalReplicas[0], 95 } 96 // TODO(a-robinson): The below three lines won't be needed once the old 97 // rangeInfo code is ripped out of the allocator. 98 repl.mu.state.Stats = &enginepb.MVCCStats{} 99 repl.leaseholderStats = newReplicaStats(s.Clock(), nil) 100 repl.writeStats = newReplicaStats(s.Clock(), nil) 101 acc.addReplica(replicaWithStats{ 102 repl: repl, 103 qps: r.qps, 104 }) 105 } 106 rr.update(acc) 107 } 108 109 func TestChooseLeaseToTransfer(t *testing.T) { 110 defer leaktest.AfterTest(t)() 111 112 ctx := context.Background() 113 stopper := stop.NewStopper() 114 defer stopper.Stop(ctx) 115 116 stopper, g, _, a, _ := createTestAllocator(10, false /* deterministic */) 117 defer stopper.Stop(context.Background()) 118 gossiputil.NewStoreGossiper(g).GossipStores(noLocalityStores, t) 119 storeList, _, _ := a.storePool.getStoreList(storeFilterThrottled) 120 storeMap := storeListToMap(storeList) 121 122 const minQPS = 800 123 const maxQPS = 1200 124 125 localDesc := *noLocalityStores[0] 126 cfg := TestStoreConfig(nil) 127 s := createTestStoreWithoutStart(t, stopper, testStoreOpts{createSystemRanges: true}, &cfg) 128 s.Ident = &roachpb.StoreIdent{StoreID: localDesc.StoreID} 129 rq := newReplicateQueue(s, g, a) 130 rr := newReplicaRankings() 131 132 sr := NewStoreRebalancer(cfg.AmbientCtx, cfg.Settings, rq, rr) 133 134 // Rather than trying to populate every Replica with a real raft group in 135 // order to pass replicaIsBehind checks, fake out the function for getting 136 // raft status with one that always returns all replicas as up to date. 137 sr.getRaftStatusFn = func(r *Replica) *raft.Status { 138 status := &raft.Status{ 139 Progress: make(map[uint64]tracker.Progress), 140 } 141 status.Lead = uint64(r.ReplicaID()) 142 status.Commit = 1 143 for _, replica := range r.Desc().InternalReplicas { 144 status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ 145 Match: 1, 146 State: tracker.StateReplicate, 147 } 148 } 149 return status 150 } 151 152 testCases := []struct { 153 storeIDs []roachpb.StoreID 154 qps float64 155 expectTarget roachpb.StoreID 156 }{ 157 {[]roachpb.StoreID{1}, 100, 0}, 158 {[]roachpb.StoreID{1, 2}, 100, 0}, 159 {[]roachpb.StoreID{1, 3}, 100, 0}, 160 {[]roachpb.StoreID{1, 4}, 100, 4}, 161 {[]roachpb.StoreID{1, 5}, 100, 5}, 162 {[]roachpb.StoreID{5, 1}, 100, 0}, 163 {[]roachpb.StoreID{1, 2}, 200, 0}, 164 {[]roachpb.StoreID{1, 3}, 200, 0}, 165 {[]roachpb.StoreID{1, 4}, 200, 0}, 166 {[]roachpb.StoreID{1, 5}, 200, 5}, 167 {[]roachpb.StoreID{1, 2}, 500, 0}, 168 {[]roachpb.StoreID{1, 3}, 500, 0}, 169 {[]roachpb.StoreID{1, 4}, 500, 0}, 170 {[]roachpb.StoreID{1, 5}, 500, 5}, 171 {[]roachpb.StoreID{1, 5}, 600, 5}, 172 {[]roachpb.StoreID{1, 5}, 700, 5}, 173 {[]roachpb.StoreID{1, 5}, 800, 0}, 174 {[]roachpb.StoreID{1, 4}, 1.5, 4}, 175 {[]roachpb.StoreID{1, 5}, 1.5, 5}, 176 {[]roachpb.StoreID{1, 4}, 1.49, 0}, 177 {[]roachpb.StoreID{1, 5}, 1.49, 0}, 178 } 179 180 for _, tc := range testCases { 181 loadRanges(rr, s, []testRange{{storeIDs: tc.storeIDs, qps: tc.qps}}) 182 hottestRanges := rr.topQPS() 183 _, target, _ := sr.chooseLeaseToTransfer( 184 ctx, &hottestRanges, &localDesc, storeList, storeMap, minQPS, maxQPS) 185 if target.StoreID != tc.expectTarget { 186 t.Errorf("got target store %d for range with replicas %v and %f qps; want %d", 187 target.StoreID, tc.storeIDs, tc.qps, tc.expectTarget) 188 } 189 } 190 } 191 192 func TestChooseReplicaToRebalance(t *testing.T) { 193 defer leaktest.AfterTest(t)() 194 195 ctx := context.Background() 196 stopper := stop.NewStopper() 197 defer stopper.Stop(ctx) 198 199 stopper, g, _, a, _ := createTestAllocator(10, false /* deterministic */) 200 defer stopper.Stop(context.Background()) 201 gossiputil.NewStoreGossiper(g).GossipStores(noLocalityStores, t) 202 storeList, _, _ := a.storePool.getStoreList(storeFilterThrottled) 203 storeMap := storeListToMap(storeList) 204 205 const minQPS = 800 206 const maxQPS = 1200 207 208 localDesc := *noLocalityStores[0] 209 cfg := TestStoreConfig(nil) 210 s := createTestStoreWithoutStart(t, stopper, testStoreOpts{createSystemRanges: true}, &cfg) 211 s.Ident = &roachpb.StoreIdent{StoreID: localDesc.StoreID} 212 rq := newReplicateQueue(s, g, a) 213 rr := newReplicaRankings() 214 215 sr := NewStoreRebalancer(cfg.AmbientCtx, cfg.Settings, rq, rr) 216 217 // Rather than trying to populate every Replica with a real raft group in 218 // order to pass replicaIsBehind checks, fake out the function for getting 219 // raft status with one that always returns all replicas as up to date. 220 sr.getRaftStatusFn = func(r *Replica) *raft.Status { 221 status := &raft.Status{ 222 Progress: make(map[uint64]tracker.Progress), 223 } 224 status.Lead = uint64(r.ReplicaID()) 225 status.Commit = 1 226 for _, replica := range r.Desc().InternalReplicas { 227 status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ 228 Match: 1, 229 State: tracker.StateReplicate, 230 } 231 } 232 return status 233 } 234 235 testCases := []struct { 236 storeIDs []roachpb.StoreID 237 qps float64 238 expectTargets []roachpb.StoreID // the first listed store is expected to be the leaseholder 239 }{ 240 {[]roachpb.StoreID{1}, 100, []roachpb.StoreID{5}}, 241 {[]roachpb.StoreID{1}, 500, []roachpb.StoreID{5}}, 242 {[]roachpb.StoreID{1}, 700, []roachpb.StoreID{5}}, 243 {[]roachpb.StoreID{1}, 800, nil}, 244 {[]roachpb.StoreID{1}, 1.5, []roachpb.StoreID{5}}, 245 {[]roachpb.StoreID{1}, 1.49, nil}, 246 {[]roachpb.StoreID{1, 2}, 100, []roachpb.StoreID{5, 2}}, 247 {[]roachpb.StoreID{1, 3}, 100, []roachpb.StoreID{5, 3}}, 248 {[]roachpb.StoreID{1, 4}, 100, []roachpb.StoreID{5, 4}}, 249 {[]roachpb.StoreID{1, 2}, 800, nil}, 250 {[]roachpb.StoreID{1, 2}, 1.49, nil}, 251 {[]roachpb.StoreID{1, 4, 5}, 500, nil}, 252 {[]roachpb.StoreID{1, 4, 5}, 100, nil}, 253 {[]roachpb.StoreID{1, 3, 5}, 500, nil}, 254 {[]roachpb.StoreID{1, 3, 4}, 500, []roachpb.StoreID{5, 4, 3}}, 255 {[]roachpb.StoreID{1, 3, 5}, 100, []roachpb.StoreID{5, 4, 3}}, 256 // Rebalancing to s2 isn't chosen even though it's better than s1 because it's above the mean. 257 {[]roachpb.StoreID{1, 3, 4, 5}, 100, nil}, 258 {[]roachpb.StoreID{1, 2, 4, 5}, 100, nil}, 259 {[]roachpb.StoreID{1, 2, 3, 5}, 100, []roachpb.StoreID{5, 4, 3, 2}}, 260 {[]roachpb.StoreID{1, 2, 3, 4}, 100, []roachpb.StoreID{5, 4, 3, 2}}, 261 } 262 263 for _, tc := range testCases { 264 t.Run("", func(t *testing.T) { 265 s.cfg.DefaultZoneConfig.NumReplicas = proto.Int32(int32(len(tc.storeIDs))) 266 loadRanges(rr, s, []testRange{{storeIDs: tc.storeIDs, qps: tc.qps}}) 267 hottestRanges := rr.topQPS() 268 _, targets := sr.chooseReplicaToRebalance( 269 ctx, &hottestRanges, &localDesc, storeList, storeMap, minQPS, maxQPS) 270 271 if len(targets) != len(tc.expectTargets) { 272 t.Fatalf("chooseReplicaToRebalance(existing=%v, qps=%f) got %v; want %v", 273 tc.storeIDs, tc.qps, targets, tc.expectTargets) 274 } 275 if len(targets) == 0 { 276 return 277 } 278 279 if targets[0].StoreID != tc.expectTargets[0] { 280 t.Errorf("chooseReplicaToRebalance(existing=%v, qps=%f) chose s%d as leaseholder; want s%v", 281 tc.storeIDs, tc.qps, targets[0], tc.expectTargets[0]) 282 } 283 284 targetStores := make([]roachpb.StoreID, len(targets)) 285 for i, target := range targets { 286 targetStores[i] = target.StoreID 287 } 288 sort.Sort(roachpb.StoreIDSlice(targetStores)) 289 sort.Sort(roachpb.StoreIDSlice(tc.expectTargets)) 290 if !reflect.DeepEqual(targetStores, tc.expectTargets) { 291 t.Errorf("chooseReplicaToRebalance(existing=%v, qps=%f) chose targets %v; want %v", 292 tc.storeIDs, tc.qps, targetStores, tc.expectTargets) 293 } 294 }) 295 } 296 } 297 298 func TestNoLeaseTransferToBehindReplicas(t *testing.T) { 299 defer leaktest.AfterTest(t)() 300 301 // Lots of setup boilerplate. 302 303 ctx := context.Background() 304 stopper := stop.NewStopper() 305 defer stopper.Stop(ctx) 306 307 stopper, g, _, a, _ := createTestAllocator(10, false /* deterministic */) 308 defer stopper.Stop(context.Background()) 309 gossiputil.NewStoreGossiper(g).GossipStores(noLocalityStores, t) 310 storeList, _, _ := a.storePool.getStoreList(storeFilterThrottled) 311 storeMap := storeListToMap(storeList) 312 313 const minQPS = 800 314 const maxQPS = 1200 315 316 localDesc := *noLocalityStores[0] 317 cfg := TestStoreConfig(nil) 318 s := createTestStoreWithoutStart(t, stopper, testStoreOpts{createSystemRanges: true}, &cfg) 319 s.Ident = &roachpb.StoreIdent{StoreID: localDesc.StoreID} 320 rq := newReplicateQueue(s, g, a) 321 rr := newReplicaRankings() 322 323 sr := NewStoreRebalancer(cfg.AmbientCtx, cfg.Settings, rq, rr) 324 325 // Load in a range with replicas on an overfull node, a slightly underfull 326 // node, and a very underfull node. 327 loadRanges(rr, s, []testRange{{storeIDs: []roachpb.StoreID{1, 4, 5}, qps: 100}}) 328 hottestRanges := rr.topQPS() 329 repl := hottestRanges[0].repl 330 331 // Set up a fake RaftStatus that indicates s5 is behind (but all other stores 332 // are caught up). We thus shouldn't transfer a lease to s5. 333 sr.getRaftStatusFn = func(r *Replica) *raft.Status { 334 status := &raft.Status{ 335 Progress: make(map[uint64]tracker.Progress), 336 } 337 status.Lead = uint64(r.ReplicaID()) 338 status.Commit = 1 339 for _, replica := range r.Desc().InternalReplicas { 340 match := uint64(1) 341 if replica.StoreID == roachpb.StoreID(5) { 342 match = 0 343 } 344 status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ 345 Match: match, 346 State: tracker.StateReplicate, 347 } 348 } 349 return status 350 } 351 352 _, target, _ := sr.chooseLeaseToTransfer( 353 ctx, &hottestRanges, &localDesc, storeList, storeMap, minQPS, maxQPS) 354 expectTarget := roachpb.StoreID(4) 355 if target.StoreID != expectTarget { 356 t.Errorf("got target store s%d for range with RaftStatus %v; want s%d", 357 target.StoreID, sr.getRaftStatusFn(repl), expectTarget) 358 } 359 360 // Then do the same, but for replica rebalancing. Make s5 an existing replica 361 // that's behind, and see how a new replica is preferred as the leaseholder 362 // over it. 363 loadRanges(rr, s, []testRange{{storeIDs: []roachpb.StoreID{1, 3, 5}, qps: 100}}) 364 hottestRanges = rr.topQPS() 365 repl = hottestRanges[0].repl 366 367 _, targets := sr.chooseReplicaToRebalance( 368 ctx, &hottestRanges, &localDesc, storeList, storeMap, minQPS, maxQPS) 369 expectTargets := []roachpb.ReplicationTarget{ 370 {NodeID: 4, StoreID: 4}, {NodeID: 5, StoreID: 5}, {NodeID: 3, StoreID: 3}, 371 } 372 if !reflect.DeepEqual(targets, expectTargets) { 373 t.Errorf("got targets %v for range with RaftStatus %v; want %v", 374 targets, sr.getRaftStatusFn(repl), expectTargets) 375 } 376 }