github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/allocator.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	gosql "database/sql"
	"fmt"
	"math"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
)

func registerAllocator(r *testRegistry) {
	runAllocator := func(ctx context.Context, t *test, c *cluster, start int, maxStdDev float64) {
		const fixturePath = `gs://cockroach-fixtures/workload/tpch/scalefactor=10/backup`
		c.Put(ctx, cockroach, "./cockroach")
		c.Put(ctx, workload, "./workload")

		// Start the first `start` nodes and restore the fixture.
		args := startArgs("--args=--vmodule=store_rebalancer=5,allocator=5,allocator_scorer=5,replicate_queue=5")
		c.Start(ctx, t, c.Range(1, start), args)
		db := c.Conn(ctx, 1)
		defer db.Close()

		m := newMonitor(ctx, c, c.Range(1, start))
		m.Go(func(ctx context.Context) error {
			t.Status("loading fixture")
			if _, err := db.Exec(`RESTORE DATABASE tpch FROM $1`, fixturePath); err != nil {
				t.Fatal(err)
			}
			return nil
		})
		m.Wait()

		// Start the remaining nodes to kick off upreplication/rebalancing.
		c.Start(ctx, t, c.Range(start+1, c.spec.NodeCount), args)

		c.Run(ctx, c.Node(1), `./workload init kv --drop`)
		for node := 1; node <= c.spec.NodeCount; node++ {
			node := node
			// TODO(dan): Ideally, the test would fail if this queryload failed,
			// but we can't put it in monitor as-is because the test deadlocks.
			go func() {
				const cmd = `./workload run kv --tolerate-errors --min-block-bytes=8 --max-block-bytes=127`
				l, err := t.l.ChildLogger(fmt.Sprintf(`kv-%d`, node))
				if err != nil {
					t.Fatal(err)
				}
				defer l.close()
				_ = execCmd(ctx, t.l, roachprod, "ssh", c.makeNodes(c.Node(node)), "--", cmd)
			}()
		}

		m = newMonitor(ctx, c, c.All())
		m.Go(func(ctx context.Context) error {
			t.Status("waiting for rebalance")
			return waitForRebalance(ctx, t.l, db, maxStdDev)
		})
		m.Wait()
	}

	r.Add(testSpec{
		Name:    `replicate/up/1to3`,
		Owner:   OwnerKV,
		Cluster: makeClusterSpec(3),
		Run: func(ctx context.Context, t *test, c *cluster) {
			runAllocator(ctx, t, c, 1, 10.0)
		},
	})
	r.Add(testSpec{
		Name:    `replicate/rebalance/3to5`,
		Owner:   OwnerKV,
		Cluster: makeClusterSpec(5),
		Run: func(ctx context.Context, t *test, c *cluster) {
			runAllocator(ctx, t, c, 3, 42.0)
		},
	})
	r.Add(testSpec{
		Name:    `replicate/wide`,
		Owner:   OwnerKV,
		Timeout: 10 * time.Minute,
		Cluster: makeClusterSpec(9, cpu(1)),
		Run:     runWideReplication,
	})
}

// printRebalanceStats prints the time it took for rebalancing to finish and the
// final standard deviation of replica counts across stores.
func printRebalanceStats(l *logger, db *gosql.DB) error {
	// TODO(cuongdo): Output these in a machine-friendly way and graph.

	// Output time it took to rebalance.
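	// The interval is computed entirely in SQL: the newest entry in
	// system.rangelog approximates when the last rebalancing action happened,
	// and the newest `node_join` event in system.eventlog approximates when the
	// final node joined, so their difference is (roughly) how long the cluster
	// spent rebalancing.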
	{
		var rebalanceIntervalStr string
		if err := db.QueryRow(
			`SELECT (SELECT max(timestamp) FROM system.rangelog) - `+
				`(SELECT max(timestamp) FROM system.eventlog WHERE "eventType"=$1)`,
			`node_join`, /* sql.EventLogNodeJoin */
		).Scan(&rebalanceIntervalStr); err != nil {
			return err
		}
		l.Printf("cluster took %s to rebalance\n", rebalanceIntervalStr)
	}

	// Output # of range events that occurred. All other things being equal,
	// larger numbers are worse and potentially indicate thrashing.
	{
		var rangeEvents int64
		q := `SELECT count(*) from system.rangelog`
		if err := db.QueryRow(q).Scan(&rangeEvents); err != nil {
			return err
		}
		l.Printf("%d range events\n", rangeEvents)
	}

	// Output standard deviation of the replica counts for all stores.
	{
		var stdDev float64
		if err := db.QueryRow(
			`SELECT stddev(range_count) FROM crdb_internal.kv_store_status`,
		).Scan(&stdDev); err != nil {
			return err
		}
		l.Printf("stdDev(replica count) = %.2f\n", stdDev)
	}

	// Output the number of ranges on each store.
	{
		rows, err := db.Query(`SELECT store_id, range_count FROM crdb_internal.kv_store_status`)
		if err != nil {
			return err
		}
		defer rows.Close()
		for rows.Next() {
			var storeID, rangeCount int64
			if err := rows.Scan(&storeID, &rangeCount); err != nil {
				return err
			}
			l.Printf("s%d has %d ranges\n", storeID, rangeCount)
		}
	}

	return nil
}

type replicationStats struct {
	SecondsSinceLastEvent int64
	EventType             string
	RangeID               int64
	StoreID               int64
	ReplicaCountStdDev    float64
}

func (s replicationStats) String() string {
	return fmt.Sprintf("last range event: %s for range %d/store %d (%ds ago)",
		s.EventType, s.RangeID, s.StoreID, s.SecondsSinceLastEvent)
}
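
// While waitForRebalance (below) polls the cluster, it logs one replicationStats
// line per interval using the format above; a line looks roughly like this
// (the values are illustrative only):
//
//	last range event: add for range 42/store 3 (17s ago)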

// allocatorStats returns the duration of stability (i.e. no replication
// changes) and the standard deviation in replica counts. Only unrecoverable
// errors are returned.
func allocatorStats(db *gosql.DB) (s replicationStats, err error) {
	defer func() {
		if err != nil {
			s.ReplicaCountStdDev = math.MaxFloat64
		}
	}()

	// NB: These are the storage.RangeLogEventType enum, but it's intentionally
	// not used to avoid pulling in the dep.
	eventTypes := []interface{}{
		`split`, `add`, `remove`,
	}

	q := `SELECT extract_duration(seconds FROM now()-timestamp), "rangeID", "storeID", "eventType"` +
		` FROM system.rangelog WHERE "eventType" IN ($1, $2, $3) ORDER BY timestamp DESC LIMIT 1`

	row := db.QueryRow(q, eventTypes...)
	if row == nil {
		// This should never happen, because the archived store we're starting with
		// will always have some range events.
		return replicationStats{}, errors.New("couldn't find any range events")
	}
	if err := row.Scan(&s.SecondsSinceLastEvent, &s.RangeID, &s.StoreID, &s.EventType); err != nil {
		return replicationStats{}, err
	}

	if err := db.QueryRow(
		`SELECT stddev(range_count) FROM crdb_internal.kv_store_status`,
	).Scan(&s.ReplicaCountStdDev); err != nil {
		return replicationStats{}, err
	}

	return s, nil
}

// waitForRebalance waits until there's been no recent range adds, removes, or
// splits. We wait until the cluster is balanced or until `stableSeconds`
// elapses, whichever comes first. Then, it prints stats about the rebalancing
// process. If the replica count appears unbalanced, an error is returned.
//
// This method is crude but necessary. If we were to wait until range counts
// were just about even, we'd miss potential post-rebalance thrashing.
func waitForRebalance(ctx context.Context, l *logger, db *gosql.DB, maxStdDev float64) error {
	const statsInterval = 20 * time.Second
	const stableSeconds = 3 * 60

	var statsTimer timeutil.Timer
	defer statsTimer.Stop()
	statsTimer.Reset(statsInterval)
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-statsTimer.C:
			statsTimer.Read = true
			stats, err := allocatorStats(db)
			if err != nil {
				return err
			}

			l.Printf("%v\n", stats)
			if stableSeconds <= stats.SecondsSinceLastEvent {
				l.Printf("replica count stddev = %f, max allowed stddev = %f\n", stats.ReplicaCountStdDev, maxStdDev)
				if stats.ReplicaCountStdDev > maxStdDev {
					_ = printRebalanceStats(l, db)
					return errors.Errorf(
						"%ds elapsed without changes, but replica count standard "+
							"deviation is %.2f (>%.2f)", stats.SecondsSinceLastEvent,
						stats.ReplicaCountStdDev, maxStdDev)
				}
				return printRebalanceStats(l, db)
			}
			statsTimer.Reset(statsInterval)
		}
	}
}

func runWideReplication(ctx context.Context, t *test, c *cluster) {
	nodes := c.spec.NodeCount
	if nodes != 9 {
		t.Fatalf("9-node cluster required")
	}

	args := startArgs("--env=COCKROACH_SCAN_MAX_IDLE_TIME=5ms")
	c.Put(ctx, cockroach, "./cockroach")
	c.Start(ctx, t, c.All(), args)

	db := c.Conn(ctx, 1)
	defer db.Close()

	zones := func() []string {
		oldVersion := false
		rows, err := db.Query(`SELECT target FROM crdb_internal.zones`)
		// TODO(solon): Remove this block once we are no longer running roachtest
		// against version 19.1 and earlier.
		if err != nil && strings.Contains(err.Error(), `column "target" does not exist`) {
			oldVersion = true
			rows, err = db.Query(`SELECT zone_name FROM crdb_internal.zones`)
		}
		if err != nil {
			t.Fatal(err)
		}
		defer rows.Close()
		var results []string
		for rows.Next() {
			var name string
			if err := rows.Scan(&name); err != nil {
				t.Fatal(err)
			}
			// TODO(solon): Remove this block once we are no longer running roachtest
			// against version 19.1 and earlier.
			if oldVersion {
				which := "RANGE"
				if name[0] == '.' {
					name = name[1:]
				} else if strings.Count(name, ".") == 0 {
					which = "DATABASE"
				} else {
					which = "TABLE"
				}
				name = fmt.Sprintf("%s %s", which, name)
			}
			results = append(results, name)
		}
		return results
	}

	run := func(stmt string) {
		t.l.Printf("%s\n", stmt)
		if _, err := db.ExecContext(ctx, stmt); err != nil {
			t.Fatal(err)
		}
	}

	setReplication := func(width int) {
		// Change every zone to have the same number of replicas as the number of
		// nodes in the cluster.
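		// For example, with width=9 this issues one statement per zone target,
		// along the lines of `ALTER RANGE default CONFIGURE ZONE USING
		// num_replicas = 9` (the exact list of targets returned by zones()
		// depends on the cluster version).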
		for _, zone := range zones() {
			run(fmt.Sprintf(`ALTER %s CONFIGURE ZONE USING num_replicas = %d`, zone, width))
		}
	}
	setReplication(nodes)

	countMisreplicated := func(width int) int {
		var count int
		if err := db.QueryRow(
			"SELECT count(*) FROM crdb_internal.ranges WHERE array_length(replicas,1) != $1",
			width,
		).Scan(&count); err != nil {
			t.Fatal(err)
		}
		return count
	}

	waitForReplication := func(width int) {
		for count := -1; count != 0; time.Sleep(time.Second) {
			count = countMisreplicated(width)
			t.l.Printf("%d mis-replicated ranges\n", count)
		}
	}

	waitForReplication(nodes)

	numRanges := func() int {
		var count int
		if err := db.QueryRow(`SELECT count(*) FROM crdb_internal.ranges`).Scan(&count); err != nil {
			t.Fatal(err)
		}
		return count
	}()

	// Stop the cluster and restart 2/3 of the nodes.
	c.Stop(ctx)
	c.Start(ctx, t, c.Range(1, 6), args)

	waitForUnderReplicated := func(count int) {
		for start := timeutil.Now(); ; time.Sleep(time.Second) {
			query := `
SELECT sum((metrics->>'ranges.unavailable')::DECIMAL)::INT AS ranges_unavailable,
       sum((metrics->>'ranges.underreplicated')::DECIMAL)::INT AS ranges_underreplicated
FROM crdb_internal.kv_store_status
`
			var unavailable, underReplicated int
			if err := db.QueryRow(query).Scan(&unavailable, &underReplicated); err != nil {
				t.Fatal(err)
			}
			t.l.Printf("%d unavailable, %d under-replicated ranges\n", unavailable, underReplicated)
			if unavailable != 0 {
				// A freshly started cluster might show unavailable ranges for a brief
				// period of time due to the way that metric is calculated. Only
				// complain about unavailable ranges if they persist for too long.
				if timeutil.Since(start) >= 30*time.Second {
					t.Fatalf("%d unavailable ranges", unavailable)
				}
				continue
			}
			if underReplicated >= count {
				break
			}
		}
	}

	waitForUnderReplicated(numRanges)
	if n := countMisreplicated(9); n != 0 {
		t.Fatalf("expected 0 mis-replicated ranges, but found %d", n)
	}

	decom := func(id int) {
		c.Run(ctx, c.Node(1),
			fmt.Sprintf("./cockroach node decommission --insecure --wait=none %d", id))
	}

	// Decommission a node. The ranges should down-replicate to 7 replicas.
	decom(9)
	waitForReplication(7)

	// Set the replication width to 5. The replicas should down-replicate, though
	// this currently requires the time-until-store-dead threshold to pass
	// because the allocator cannot select a replica for removal that is on a
	// store for which it doesn't have a store descriptor.
	run(`SET CLUSTER SETTING server.time_until_store_dead = '90s'`)
	setReplication(5)
	waitForReplication(5)

	// Restart the down nodes to prevent the dead node detector from complaining.
	c.Start(ctx, t, c.Range(7, 9))
}
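
// A note on the wide-replication scenario above: with num_replicas = 9 on a
// 9-node cluster every range has a replica on every node, so stopping nodes
// 7-9 removes exactly 3 of 9 replicas from each range. Raft quorum for 9
// replicas is 5 and 6 replicas survive, which is why the test expects every
// range to report as under-replicated while none become unavailable.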