github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_rebalancer_test.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"reflect"
    16  	"sort"
    17  	"testing"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    21  	"github.com/cockroachdb/cockroach/pkg/testutils/gossiputil"
    22  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    23  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    24  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    25  	"github.com/gogo/protobuf/proto"
    26  	"go.etcd.io/etcd/raft"
    27  	"go.etcd.io/etcd/raft/tracker"
    28  )
    29  
    30  var (
    31  	// noLocalityStores specifies a set of stores where one store is
    32  	// under-utilized in terms of QPS, three are in the middle, and one is
    33  	// over-utilized.
    34  	noLocalityStores = []*roachpb.StoreDescriptor{
    35  		{
    36  			StoreID: 1,
    37  			Node:    roachpb.NodeDescriptor{NodeID: 1},
    38  			Capacity: roachpb.StoreCapacity{
    39  				QueriesPerSecond: 1500,
    40  			},
    41  		},
    42  		{
    43  			StoreID: 2,
    44  			Node:    roachpb.NodeDescriptor{NodeID: 2},
    45  			Capacity: roachpb.StoreCapacity{
    46  				QueriesPerSecond: 1100,
    47  			},
    48  		},
    49  		{
    50  			StoreID: 3,
    51  			Node:    roachpb.NodeDescriptor{NodeID: 3},
    52  			Capacity: roachpb.StoreCapacity{
    53  				QueriesPerSecond: 1000,
    54  			},
    55  		},
    56  		{
    57  			StoreID: 4,
    58  			Node:    roachpb.NodeDescriptor{NodeID: 4},
    59  			Capacity: roachpb.StoreCapacity{
    60  				QueriesPerSecond: 900,
    61  			},
    62  		},
    63  		{
    64  			StoreID: 5,
    65  			Node:    roachpb.NodeDescriptor{NodeID: 5},
    66  			Capacity: roachpb.StoreCapacity{
    67  				QueriesPerSecond: 500,
    68  			},
    69  		},
    70  	}
    71  )
    72  
// testRange describes one fake range handed to loadRanges: the stores that
// hold its replicas and the QPS value the range is recorded with.
type testRange struct {
	// The first storeID in the list will be the leaseholder.
	storeIDs []roachpb.StoreID
	qps      float64
}
    78  
    79  func loadRanges(rr *replicaRankings, s *Store, ranges []testRange) {
    80  	acc := rr.newAccumulator()
    81  	for _, r := range ranges {
    82  		repl := &Replica{store: s}
    83  		repl.mu.state.Desc = &roachpb.RangeDescriptor{}
    84  		repl.mu.zone = s.cfg.DefaultZoneConfig
    85  		for _, storeID := range r.storeIDs {
    86  			repl.mu.state.Desc.InternalReplicas = append(repl.mu.state.Desc.InternalReplicas, roachpb.ReplicaDescriptor{
    87  				NodeID:    roachpb.NodeID(storeID),
    88  				StoreID:   storeID,
    89  				ReplicaID: roachpb.ReplicaID(storeID),
    90  			})
    91  		}
    92  		repl.mu.state.Lease = &roachpb.Lease{
    93  			Expiration: &hlc.MaxTimestamp,
    94  			Replica:    repl.mu.state.Desc.InternalReplicas[0],
    95  		}
    96  		// TODO(a-robinson): The below three lines won't be needed once the old
    97  		// rangeInfo code is ripped out of the allocator.
    98  		repl.mu.state.Stats = &enginepb.MVCCStats{}
    99  		repl.leaseholderStats = newReplicaStats(s.Clock(), nil)
   100  		repl.writeStats = newReplicaStats(s.Clock(), nil)
   101  		acc.addReplica(replicaWithStats{
   102  			repl: repl,
   103  			qps:  r.qps,
   104  		})
   105  	}
   106  	rr.update(acc)
   107  }
   108  
   109  func TestChooseLeaseToTransfer(t *testing.T) {
   110  	defer leaktest.AfterTest(t)()
   111  
   112  	ctx := context.Background()
   113  	stopper := stop.NewStopper()
   114  	defer stopper.Stop(ctx)
   115  
   116  	stopper, g, _, a, _ := createTestAllocator(10, false /* deterministic */)
   117  	defer stopper.Stop(context.Background())
   118  	gossiputil.NewStoreGossiper(g).GossipStores(noLocalityStores, t)
   119  	storeList, _, _ := a.storePool.getStoreList(storeFilterThrottled)
   120  	storeMap := storeListToMap(storeList)
   121  
   122  	const minQPS = 800
   123  	const maxQPS = 1200
   124  
   125  	localDesc := *noLocalityStores[0]
   126  	cfg := TestStoreConfig(nil)
   127  	s := createTestStoreWithoutStart(t, stopper, testStoreOpts{createSystemRanges: true}, &cfg)
   128  	s.Ident = &roachpb.StoreIdent{StoreID: localDesc.StoreID}
   129  	rq := newReplicateQueue(s, g, a)
   130  	rr := newReplicaRankings()
   131  
   132  	sr := NewStoreRebalancer(cfg.AmbientCtx, cfg.Settings, rq, rr)
   133  
   134  	// Rather than trying to populate every Replica with a real raft group in
   135  	// order to pass replicaIsBehind checks, fake out the function for getting
   136  	// raft status with one that always returns all replicas as up to date.
   137  	sr.getRaftStatusFn = func(r *Replica) *raft.Status {
   138  		status := &raft.Status{
   139  			Progress: make(map[uint64]tracker.Progress),
   140  		}
   141  		status.Lead = uint64(r.ReplicaID())
   142  		status.Commit = 1
   143  		for _, replica := range r.Desc().InternalReplicas {
   144  			status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{
   145  				Match: 1,
   146  				State: tracker.StateReplicate,
   147  			}
   148  		}
   149  		return status
   150  	}
   151  
   152  	testCases := []struct {
   153  		storeIDs     []roachpb.StoreID
   154  		qps          float64
   155  		expectTarget roachpb.StoreID
   156  	}{
   157  		{[]roachpb.StoreID{1}, 100, 0},
   158  		{[]roachpb.StoreID{1, 2}, 100, 0},
   159  		{[]roachpb.StoreID{1, 3}, 100, 0},
   160  		{[]roachpb.StoreID{1, 4}, 100, 4},
   161  		{[]roachpb.StoreID{1, 5}, 100, 5},
   162  		{[]roachpb.StoreID{5, 1}, 100, 0},
   163  		{[]roachpb.StoreID{1, 2}, 200, 0},
   164  		{[]roachpb.StoreID{1, 3}, 200, 0},
   165  		{[]roachpb.StoreID{1, 4}, 200, 0},
   166  		{[]roachpb.StoreID{1, 5}, 200, 5},
   167  		{[]roachpb.StoreID{1, 2}, 500, 0},
   168  		{[]roachpb.StoreID{1, 3}, 500, 0},
   169  		{[]roachpb.StoreID{1, 4}, 500, 0},
   170  		{[]roachpb.StoreID{1, 5}, 500, 5},
   171  		{[]roachpb.StoreID{1, 5}, 600, 5},
   172  		{[]roachpb.StoreID{1, 5}, 700, 5},
   173  		{[]roachpb.StoreID{1, 5}, 800, 0},
   174  		{[]roachpb.StoreID{1, 4}, 1.5, 4},
   175  		{[]roachpb.StoreID{1, 5}, 1.5, 5},
   176  		{[]roachpb.StoreID{1, 4}, 1.49, 0},
   177  		{[]roachpb.StoreID{1, 5}, 1.49, 0},
   178  	}
   179  
   180  	for _, tc := range testCases {
   181  		loadRanges(rr, s, []testRange{{storeIDs: tc.storeIDs, qps: tc.qps}})
   182  		hottestRanges := rr.topQPS()
   183  		_, target, _ := sr.chooseLeaseToTransfer(
   184  			ctx, &hottestRanges, &localDesc, storeList, storeMap, minQPS, maxQPS)
   185  		if target.StoreID != tc.expectTarget {
   186  			t.Errorf("got target store %d for range with replicas %v and %f qps; want %d",
   187  				target.StoreID, tc.storeIDs, tc.qps, tc.expectTarget)
   188  		}
   189  	}
   190  }
   191  
   192  func TestChooseReplicaToRebalance(t *testing.T) {
   193  	defer leaktest.AfterTest(t)()
   194  
   195  	ctx := context.Background()
   196  	stopper := stop.NewStopper()
   197  	defer stopper.Stop(ctx)
   198  
   199  	stopper, g, _, a, _ := createTestAllocator(10, false /* deterministic */)
   200  	defer stopper.Stop(context.Background())
   201  	gossiputil.NewStoreGossiper(g).GossipStores(noLocalityStores, t)
   202  	storeList, _, _ := a.storePool.getStoreList(storeFilterThrottled)
   203  	storeMap := storeListToMap(storeList)
   204  
   205  	const minQPS = 800
   206  	const maxQPS = 1200
   207  
   208  	localDesc := *noLocalityStores[0]
   209  	cfg := TestStoreConfig(nil)
   210  	s := createTestStoreWithoutStart(t, stopper, testStoreOpts{createSystemRanges: true}, &cfg)
   211  	s.Ident = &roachpb.StoreIdent{StoreID: localDesc.StoreID}
   212  	rq := newReplicateQueue(s, g, a)
   213  	rr := newReplicaRankings()
   214  
   215  	sr := NewStoreRebalancer(cfg.AmbientCtx, cfg.Settings, rq, rr)
   216  
   217  	// Rather than trying to populate every Replica with a real raft group in
   218  	// order to pass replicaIsBehind checks, fake out the function for getting
   219  	// raft status with one that always returns all replicas as up to date.
   220  	sr.getRaftStatusFn = func(r *Replica) *raft.Status {
   221  		status := &raft.Status{
   222  			Progress: make(map[uint64]tracker.Progress),
   223  		}
   224  		status.Lead = uint64(r.ReplicaID())
   225  		status.Commit = 1
   226  		for _, replica := range r.Desc().InternalReplicas {
   227  			status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{
   228  				Match: 1,
   229  				State: tracker.StateReplicate,
   230  			}
   231  		}
   232  		return status
   233  	}
   234  
   235  	testCases := []struct {
   236  		storeIDs      []roachpb.StoreID
   237  		qps           float64
   238  		expectTargets []roachpb.StoreID // the first listed store is expected to be the leaseholder
   239  	}{
   240  		{[]roachpb.StoreID{1}, 100, []roachpb.StoreID{5}},
   241  		{[]roachpb.StoreID{1}, 500, []roachpb.StoreID{5}},
   242  		{[]roachpb.StoreID{1}, 700, []roachpb.StoreID{5}},
   243  		{[]roachpb.StoreID{1}, 800, nil},
   244  		{[]roachpb.StoreID{1}, 1.5, []roachpb.StoreID{5}},
   245  		{[]roachpb.StoreID{1}, 1.49, nil},
   246  		{[]roachpb.StoreID{1, 2}, 100, []roachpb.StoreID{5, 2}},
   247  		{[]roachpb.StoreID{1, 3}, 100, []roachpb.StoreID{5, 3}},
   248  		{[]roachpb.StoreID{1, 4}, 100, []roachpb.StoreID{5, 4}},
   249  		{[]roachpb.StoreID{1, 2}, 800, nil},
   250  		{[]roachpb.StoreID{1, 2}, 1.49, nil},
   251  		{[]roachpb.StoreID{1, 4, 5}, 500, nil},
   252  		{[]roachpb.StoreID{1, 4, 5}, 100, nil},
   253  		{[]roachpb.StoreID{1, 3, 5}, 500, nil},
   254  		{[]roachpb.StoreID{1, 3, 4}, 500, []roachpb.StoreID{5, 4, 3}},
   255  		{[]roachpb.StoreID{1, 3, 5}, 100, []roachpb.StoreID{5, 4, 3}},
   256  		// Rebalancing to s2 isn't chosen even though it's better than s1 because it's above the mean.
   257  		{[]roachpb.StoreID{1, 3, 4, 5}, 100, nil},
   258  		{[]roachpb.StoreID{1, 2, 4, 5}, 100, nil},
   259  		{[]roachpb.StoreID{1, 2, 3, 5}, 100, []roachpb.StoreID{5, 4, 3, 2}},
   260  		{[]roachpb.StoreID{1, 2, 3, 4}, 100, []roachpb.StoreID{5, 4, 3, 2}},
   261  	}
   262  
   263  	for _, tc := range testCases {
   264  		t.Run("", func(t *testing.T) {
   265  			s.cfg.DefaultZoneConfig.NumReplicas = proto.Int32(int32(len(tc.storeIDs)))
   266  			loadRanges(rr, s, []testRange{{storeIDs: tc.storeIDs, qps: tc.qps}})
   267  			hottestRanges := rr.topQPS()
   268  			_, targets := sr.chooseReplicaToRebalance(
   269  				ctx, &hottestRanges, &localDesc, storeList, storeMap, minQPS, maxQPS)
   270  
   271  			if len(targets) != len(tc.expectTargets) {
   272  				t.Fatalf("chooseReplicaToRebalance(existing=%v, qps=%f) got %v; want %v",
   273  					tc.storeIDs, tc.qps, targets, tc.expectTargets)
   274  			}
   275  			if len(targets) == 0 {
   276  				return
   277  			}
   278  
   279  			if targets[0].StoreID != tc.expectTargets[0] {
   280  				t.Errorf("chooseReplicaToRebalance(existing=%v, qps=%f) chose s%d as leaseholder; want s%v",
   281  					tc.storeIDs, tc.qps, targets[0], tc.expectTargets[0])
   282  			}
   283  
   284  			targetStores := make([]roachpb.StoreID, len(targets))
   285  			for i, target := range targets {
   286  				targetStores[i] = target.StoreID
   287  			}
   288  			sort.Sort(roachpb.StoreIDSlice(targetStores))
   289  			sort.Sort(roachpb.StoreIDSlice(tc.expectTargets))
   290  			if !reflect.DeepEqual(targetStores, tc.expectTargets) {
   291  				t.Errorf("chooseReplicaToRebalance(existing=%v, qps=%f) chose targets %v; want %v",
   292  					tc.storeIDs, tc.qps, targetStores, tc.expectTargets)
   293  			}
   294  		})
   295  	}
   296  }
   297  
   298  func TestNoLeaseTransferToBehindReplicas(t *testing.T) {
   299  	defer leaktest.AfterTest(t)()
   300  
   301  	// Lots of setup boilerplate.
   302  
   303  	ctx := context.Background()
   304  	stopper := stop.NewStopper()
   305  	defer stopper.Stop(ctx)
   306  
   307  	stopper, g, _, a, _ := createTestAllocator(10, false /* deterministic */)
   308  	defer stopper.Stop(context.Background())
   309  	gossiputil.NewStoreGossiper(g).GossipStores(noLocalityStores, t)
   310  	storeList, _, _ := a.storePool.getStoreList(storeFilterThrottled)
   311  	storeMap := storeListToMap(storeList)
   312  
   313  	const minQPS = 800
   314  	const maxQPS = 1200
   315  
   316  	localDesc := *noLocalityStores[0]
   317  	cfg := TestStoreConfig(nil)
   318  	s := createTestStoreWithoutStart(t, stopper, testStoreOpts{createSystemRanges: true}, &cfg)
   319  	s.Ident = &roachpb.StoreIdent{StoreID: localDesc.StoreID}
   320  	rq := newReplicateQueue(s, g, a)
   321  	rr := newReplicaRankings()
   322  
   323  	sr := NewStoreRebalancer(cfg.AmbientCtx, cfg.Settings, rq, rr)
   324  
   325  	// Load in a range with replicas on an overfull node, a slightly underfull
   326  	// node, and a very underfull node.
   327  	loadRanges(rr, s, []testRange{{storeIDs: []roachpb.StoreID{1, 4, 5}, qps: 100}})
   328  	hottestRanges := rr.topQPS()
   329  	repl := hottestRanges[0].repl
   330  
   331  	// Set up a fake RaftStatus that indicates s5 is behind (but all other stores
   332  	// are caught up). We thus shouldn't transfer a lease to s5.
   333  	sr.getRaftStatusFn = func(r *Replica) *raft.Status {
   334  		status := &raft.Status{
   335  			Progress: make(map[uint64]tracker.Progress),
   336  		}
   337  		status.Lead = uint64(r.ReplicaID())
   338  		status.Commit = 1
   339  		for _, replica := range r.Desc().InternalReplicas {
   340  			match := uint64(1)
   341  			if replica.StoreID == roachpb.StoreID(5) {
   342  				match = 0
   343  			}
   344  			status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{
   345  				Match: match,
   346  				State: tracker.StateReplicate,
   347  			}
   348  		}
   349  		return status
   350  	}
   351  
   352  	_, target, _ := sr.chooseLeaseToTransfer(
   353  		ctx, &hottestRanges, &localDesc, storeList, storeMap, minQPS, maxQPS)
   354  	expectTarget := roachpb.StoreID(4)
   355  	if target.StoreID != expectTarget {
   356  		t.Errorf("got target store s%d for range with RaftStatus %v; want s%d",
   357  			target.StoreID, sr.getRaftStatusFn(repl), expectTarget)
   358  	}
   359  
   360  	// Then do the same, but for replica rebalancing. Make s5 an existing replica
   361  	// that's behind, and see how a new replica is preferred as the leaseholder
   362  	// over it.
   363  	loadRanges(rr, s, []testRange{{storeIDs: []roachpb.StoreID{1, 3, 5}, qps: 100}})
   364  	hottestRanges = rr.topQPS()
   365  	repl = hottestRanges[0].repl
   366  
   367  	_, targets := sr.chooseReplicaToRebalance(
   368  		ctx, &hottestRanges, &localDesc, storeList, storeMap, minQPS, maxQPS)
   369  	expectTargets := []roachpb.ReplicationTarget{
   370  		{NodeID: 4, StoreID: 4}, {NodeID: 5, StoreID: 5}, {NodeID: 3, StoreID: 3},
   371  	}
   372  	if !reflect.DeepEqual(targets, expectTargets) {
   373  		t.Errorf("got targets %v for range with RaftStatus %v; want %v",
   374  			targets, sr.getRaftStatusFn(repl), expectTargets)
   375  	}
   376  }