github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/rebalance_load.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
    "context"
    gosql "database/sql"
    "fmt"
    "sort"
    "strings"
    "time"

    "github.com/cockroachdb/cockroach/pkg/util/timeutil"
    "github.com/cockroachdb/errors"
    "golang.org/x/sync/errgroup"
)

func registerRebalanceLoad(r *testRegistry) {
    // This test creates a single table for kv to use and splits the table to
    // have one range for every node in the cluster. Because even brand new
    // clusters start with 20+ ranges in them, the number of new ranges in kv's
    // table is small enough that it typically won't trigger rebalancing of
    // leases in the cluster based on lease count alone. We let kv generate a lot
    // of load against the ranges such that when
    // kv.allocator.stat_based_rebalancing.enabled is set to true, we'd expect
    // load-based rebalancing to distribute the load evenly across the nodes in
    // the cluster. Without that setting, the fact that the kv table has so few
    // ranges means that they probabilistically won't have their leases evenly
    // spread across all the nodes (they'll often just end up staying on n1).
    //
    // In other words, this test should always pass with
    // kv.allocator.stat_based_rebalancing.enabled set to true, while it should
    // usually (but not always) fail with it set to false.
    rebalanceLoadRun := func(
        ctx context.Context,
        t *test,
        c *cluster,
        rebalanceMode string,
        maxDuration time.Duration,
        concurrency int,
    ) {
        roachNodes := c.Range(1, c.spec.NodeCount-1)
        appNode := c.Node(c.spec.NodeCount)
        splits := len(roachNodes) - 1 // n-1 splits => n ranges => 1 lease per node

        c.Put(ctx, cockroach, "./cockroach", roachNodes)
        args := startArgs(
            "--args=--vmodule=store_rebalancer=5,allocator=5,allocator_scorer=5,replicate_queue=5")
        c.Start(ctx, t, roachNodes, args)

        c.Put(ctx, workload, "./workload", appNode)
        c.Run(ctx, appNode, fmt.Sprintf("./workload init kv --drop --splits=%d {pgurl:1}", splits))

        var m *errgroup.Group // see comment in version.go
        m, ctx = errgroup.WithContext(ctx)

        // Enable us to exit out of workload early when we achieve the desired
        // lease balance. This drastically shortens the duration of the test in the
        // common case.
        ctx, cancel := context.WithCancel(ctx)

        m.Go(func() error {
            t.l.Printf("starting load generator\n")

            err := c.RunE(ctx, appNode, fmt.Sprintf(
                "./workload run kv --read-percent=95 --tolerate-errors --concurrency=%d "+
                    "--duration=%v {pgurl:1-%d}",
                concurrency, maxDuration, len(roachNodes)))
            if errors.Is(ctx.Err(), context.Canceled) {
                // We got canceled either because lease balance was achieved or the
                // other worker hit an error. In either case, it's not this worker's
                // fault.
                return nil
            }
            return err
        })

        m.Go(func() error {
            t.Status("checking for lease balance")

            db := c.Conn(ctx, 1)
            defer db.Close()

            t.Status("disable load based splitting")
            if err := disableLoadBasedSplitting(ctx, db); err != nil {
                return err
            }

            if _, err := db.ExecContext(
                ctx, `SET CLUSTER SETTING kv.allocator.load_based_rebalancing=$1::string`, rebalanceMode,
            ); err != nil {
                return err
            }

            for tBegin := timeutil.Now(); timeutil.Since(tBegin) <= maxDuration; {
                if done, err := isLoadEvenlyDistributed(t.l, db, len(roachNodes)); err != nil {
                    return err
                } else if done {
                    t.Status("successfully achieved lease balance; waiting for kv to finish running")
                    cancel()
                    return nil
                }

                select {
                case <-ctx.Done():
                    return ctx.Err()
                case <-time.After(5 * time.Second):
                }
            }

            return fmt.Errorf("timed out before leases were evenly spread")
        })
        if err := m.Wait(); err != nil {
            t.Fatal(err)
        }
    }

    concurrency := 128

    r.Add(testSpec{
        Name:       `rebalance/by-load/leases`,
        Owner:      OwnerKV,
        Cluster:    makeClusterSpec(4), // the last node is just used to generate load
        MinVersion: "v2.1.0",
        Run: func(ctx context.Context, t *test, c *cluster) {
            if local {
                concurrency = 32
                fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
            }
            rebalanceLoadRun(ctx, t, c, "leases", 3*time.Minute, concurrency)
        },
    })
    r.Add(testSpec{
        Name:       `rebalance/by-load/replicas`,
        Owner:      OwnerKV,
        Cluster:    makeClusterSpec(7), // the last node is just used to generate load
        MinVersion: "v2.1.0",
        Run: func(ctx context.Context, t *test, c *cluster) {
            if local {
                concurrency = 32
                fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
            }
            rebalanceLoadRun(ctx, t, c, "leases and replicas", 5*time.Minute, concurrency)
        },
    })
}

func isLoadEvenlyDistributed(l *logger, db *gosql.DB, numNodes int) (bool, error) {
    rows, err := db.Query(
        `select lease_holder, count(*) ` +
            `from [show ranges from table kv.kv] ` +
            `group by lease_holder;`)
    if err != nil {
        // TODO(rafi): Remove experimental_ranges query once we stop testing 19.1 or
        // earlier.
        if strings.Contains(err.Error(), "syntax error at or near \"ranges\"") {
            rows, err = db.Query(
                `select lease_holder, count(*) ` +
                    `from [show experimental_ranges from table kv.kv] ` +
                    `group by lease_holder;`)
        }
    }
    if err != nil {
        return false, err
    }
    defer rows.Close()
    leaseCounts := make(map[int]int)
    var rangeCount int
    for rows.Next() {
        var storeID, leaseCount int
        if err := rows.Scan(&storeID, &leaseCount); err != nil {
            return false, err
        }
        leaseCounts[storeID] = leaseCount
        rangeCount += leaseCount
    }

    if len(leaseCounts) < numNodes {
        l.Printf("not all nodes have a lease yet: %v\n", formatLeaseCounts(leaseCounts))
        return false, nil
    }

    // The simple case is when ranges haven't split. We can require that every
    // store has one lease.
    if rangeCount == numNodes {
        for _, leaseCount := range leaseCounts {
            if leaseCount != 1 {
                l.Printf("uneven lease distribution: %s\n", formatLeaseCounts(leaseCounts))
                return false, nil
            }
        }
        l.Printf("leases successfully distributed: %s\n", formatLeaseCounts(leaseCounts))
        return true, nil
    }

    // For completeness, if the ranges have split, verify that the leases per
    // store don't differ by more than 1.
    leases := make([]int, 0, numNodes)
    for _, leaseCount := range leaseCounts {
        leases = append(leases, leaseCount)
    }
    sort.Ints(leases)
    if leases[0]+1 < leases[len(leases)-1] {
        l.Printf("leases per store differ by more than one: %s\n", formatLeaseCounts(leaseCounts))
        return false, nil
    }

    l.Printf("leases successfully distributed: %s\n", formatLeaseCounts(leaseCounts))
    return true, nil
}

func formatLeaseCounts(counts map[int]int) string {
    storeIDs := make([]int, 0, len(counts))
    for storeID := range counts {
        storeIDs = append(storeIDs, storeID)
    }
    sort.Ints(storeIDs)
    strs := make([]string, 0, len(counts))
    for _, storeID := range storeIDs {
        strs = append(strs, fmt.Sprintf("s%d: %d", storeID, counts[storeID]))
    }
    return fmt.Sprintf("[%s]", strings.Join(strs, ", "))
}
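For reference, below is a minimal standalone sketch (separate from the file above; the helper name leasesBalanced and the sample counts are hypothetical) of the balance criterion isLoadEvenlyDistributed applies once the kv ranges have split: sort the per-store lease counts and require that the smallest and largest differ by at most one.

package main

import (
    "fmt"
    "sort"
)

// leasesBalanced reports whether the per-store lease counts differ by at most
// one, mirroring the post-split check in isLoadEvenlyDistributed.
func leasesBalanced(leaseCounts map[int]int) bool {
    if len(leaseCounts) == 0 {
        return false
    }
    leases := make([]int, 0, len(leaseCounts))
    for _, c := range leaseCounts {
        leases = append(leases, c)
    }
    sort.Ints(leases)
    // Balanced when the min and max per-store counts differ by at most 1.
    return leases[0]+1 >= leases[len(leases)-1]
}

func main() {
    fmt.Println(leasesBalanced(map[int]int{1: 3, 2: 3, 3: 4})) // true: max-min == 1
    fmt.Println(leasesBalanced(map[int]int{1: 2, 2: 3, 3: 5})) // false: max-min == 3
}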