github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/zerosum/main.go (about) 1 // Copyright 2016 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package main 12 13 import ( 14 "bytes" 15 "context" 16 gosql "database/sql" 17 "flag" 18 "fmt" 19 "math/rand" 20 "os" 21 "os/signal" 22 "runtime" 23 "strings" 24 "sync/atomic" 25 "syscall" 26 "time" 27 28 "github.com/cockroachdb/cockroach-go/crdb" 29 "github.com/cockroachdb/cockroach/pkg/acceptance/cluster" 30 "github.com/cockroachdb/cockroach/pkg/acceptance/localcluster" 31 "github.com/cockroachdb/cockroach/pkg/keys" 32 "github.com/cockroachdb/cockroach/pkg/util/encoding" 33 "github.com/cockroachdb/cockroach/pkg/util/log" 34 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 35 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 36 ) 37 38 var workers = flag.Int("w", 2*runtime.NumCPU(), "number of workers") 39 var monkeys = flag.Int("m", 3, "number of monkeys") 40 var numNodes = flag.Int("n", 4, "number of nodes") 41 var numAccounts = flag.Int("a", 1e5, "number of accounts") 42 var chaosType = flag.String("c", "simple", "chaos type [none|simple|flappy]") 43 var verify = flag.Bool("verify", true, "verify range and account consistency") 44 45 func newRand() *rand.Rand { 46 return rand.New(rand.NewSource(timeutil.Now().UnixNano())) 47 } 48 49 // zeroSum is a bank-like simulation that tests correctness in the face of 50 // aggressive splits and lease transfers. A pool of workers chooses two random 51 // accounts and increments the balance in one while decrementing the balance in 52 // the other (leaving the total balance as zero, hence the name). A pool of 53 // monkeys splits ranges and moves leases every second or so. Periodically, we 54 // perform full cluster consistency checks as well as verify that the total 55 // balance in the accounts table is zero. 56 // 57 // The account IDs used by workers and chosen as split points are selected from 58 // a zipf distribution which tilts towards smaller IDs (and hence more 59 // contention). 60 type zeroSum struct { 61 *localcluster.LocalCluster 62 numAccounts int 63 chaosType string 64 accounts struct { 65 syncutil.Mutex 66 m map[uint64]struct{} 67 } 68 stats struct { 69 ops uint64 70 errors uint64 71 splits uint64 72 transfers uint64 73 } 74 ranges struct { 75 syncutil.Mutex 76 count int 77 replicas []int 78 } 79 } 80 81 func newZeroSum(c *localcluster.LocalCluster, numAccounts int, chaosType string) *zeroSum { 82 z := &zeroSum{ 83 LocalCluster: c, 84 numAccounts: numAccounts, 85 chaosType: chaosType, 86 } 87 z.accounts.m = make(map[uint64]struct{}) 88 return z 89 } 90 91 func (z *zeroSum) run(workers, monkeys int) { 92 tableID := z.setup() 93 for i := 0; i < workers; i++ { 94 go z.worker() 95 } 96 for i := 0; i < monkeys; i++ { 97 go z.monkey(tableID, 2*time.Second) 98 } 99 if workers > 0 || monkeys > 0 { 100 z.chaos() 101 if *verify { 102 go z.check(20 * time.Second) 103 go z.verify(10 * time.Second) 104 } 105 } 106 go z.rangeStats(time.Second) 107 z.monitor(time.Second) 108 } 109 110 func (z *zeroSum) setup() uint32 { 111 db := z.Nodes[0].DB() 112 if _, err := db.Exec("CREATE DATABASE IF NOT EXISTS zerosum"); err != nil { 113 log.Fatalf(context.Background(), "%v", err) 114 } 115 116 accounts := ` 117 CREATE TABLE IF NOT EXISTS accounts ( 118 id INT PRIMARY KEY, 119 balance INT NOT NULL 120 ) 121 ` 122 if _, err := db.Exec(accounts); err != nil { 123 log.Fatalf(context.Background(), "%v", err) 124 } 125 126 tableIDQuery := ` 127 SELECT tables.id FROM system.namespace tables 128 JOIN system.namespace dbs ON dbs.id = tables."parentID" 129 WHERE dbs.name = $1 AND tables.name = $2 130 ` 131 var tableID uint32 132 if err := db.QueryRow(tableIDQuery, "zerosum", "accounts").Scan(&tableID); err != nil { 133 log.Fatalf(context.Background(), "%v", err) 134 } 135 return tableID 136 } 137 138 func (z *zeroSum) accountDistribution(r *rand.Rand) *rand.Zipf { 139 // We use a Zipf distribution for selecting accounts. 140 return rand.NewZipf(r, 1.1, float64(z.numAccounts/10), uint64(z.numAccounts-1)) 141 } 142 143 func (z *zeroSum) accountsLen() int { 144 z.accounts.Lock() 145 defer z.accounts.Unlock() 146 return len(z.accounts.m) 147 } 148 149 func (z *zeroSum) maybeLogError(err error) { 150 if localcluster.IsUnavailableError(err) || strings.Contains(err.Error(), "range is frozen") { 151 return 152 } 153 log.Errorf(context.Background(), "%v", err) 154 atomic.AddUint64(&z.stats.errors, 1) 155 } 156 157 func (z *zeroSum) worker() { 158 r := newRand() 159 zipf := z.accountDistribution(r) 160 161 for { 162 from := zipf.Uint64() 163 to := zipf.Uint64() 164 if from == to { 165 continue 166 } 167 168 db := z.Nodes[z.RandNode(r.Intn)].DB() 169 if db == nil { 170 // Node is currently offline. 171 continue 172 } 173 err := crdb.ExecuteTx(context.Background(), db, nil, func(tx *gosql.Tx) error { 174 rows, err := tx.Query(`SELECT id, balance FROM accounts WHERE id IN ($1, $2)`, from, to) 175 if err != nil { 176 return err 177 } 178 179 var fromBalance, toBalance int64 180 for rows.Next() { 181 var id uint64 182 var balance int64 183 if err = rows.Scan(&id, &balance); err != nil { 184 log.Fatalf(context.Background(), "%v", err) 185 } 186 switch id { 187 case from: 188 fromBalance = balance 189 case to: 190 toBalance = balance 191 default: 192 panic(fmt.Sprintf("got unexpected account %d", id)) 193 } 194 } 195 196 upsert := `UPSERT INTO accounts VALUES ($1, $3), ($2, $4)` 197 _, err = tx.Exec(upsert, to, from, toBalance+1, fromBalance-1) 198 return err 199 }) 200 if err != nil { 201 z.maybeLogError(err) 202 } else { 203 atomic.AddUint64(&z.stats.ops, 1) 204 z.accounts.Lock() 205 z.accounts.m[from] = struct{}{} 206 z.accounts.m[to] = struct{}{} 207 z.accounts.Unlock() 208 } 209 } 210 } 211 212 func (z *zeroSum) monkey(tableID uint32, d time.Duration) { 213 r := newRand() 214 zipf := z.accountDistribution(r) 215 216 for { 217 time.Sleep(time.Duration(rand.Float64() * float64(d))) 218 219 key := keys.SystemSQLCodec.TablePrefix(tableID) 220 key = encoding.EncodeVarintAscending(key, int64(zipf.Uint64())) 221 222 switch r.Intn(2) { 223 case 0: 224 if err := z.Split(z.RandNode(r.Intn), key); err != nil { 225 z.maybeLogError(err) 226 } else { 227 atomic.AddUint64(&z.stats.splits, 1) 228 } 229 case 1: 230 if transferred, err := z.TransferLease(z.RandNode(r.Intn), r, key); err != nil { 231 z.maybeLogError(err) 232 } else if transferred { 233 atomic.AddUint64(&z.stats.transfers, 1) 234 } 235 } 236 } 237 } 238 239 func (z *zeroSum) chaosSimple() { 240 d := 15 * time.Second 241 fmt.Printf("chaos(simple): first event in %s\n", d) 242 time.Sleep(d) 243 244 nodeIdx := 0 245 node := z.Nodes[nodeIdx] 246 d = 20 * time.Second 247 fmt.Printf("chaos: killing node %d for %s\n", nodeIdx+1, d) 248 node.Kill() 249 250 time.Sleep(d) 251 fmt.Printf("chaos: starting node %d\n", nodeIdx+1) 252 node.Start(context.Background()) 253 } 254 255 func (z *zeroSum) chaosFlappy() { 256 r := newRand() 257 d := time.Duration(15+r.Intn(30)) * time.Second 258 fmt.Printf("chaos(flappy): first event in %s\n", d) 259 260 for i := 1; true; i++ { 261 time.Sleep(d) 262 263 nodeIdx := z.RandNode(r.Intn) 264 node := z.Nodes[nodeIdx] 265 d = time.Duration(15+r.Intn(30)) * time.Second 266 fmt.Printf("chaos %d: killing node %d for %s\n", i, nodeIdx+1, d) 267 node.Kill() 268 269 time.Sleep(d) 270 271 d = time.Duration(15+r.Intn(30)) * time.Second 272 fmt.Printf("chaos %d: starting node %d, next event in %s\n", i, nodeIdx+1, d) 273 node.Start(context.Background()) 274 } 275 } 276 277 func (z *zeroSum) chaos() { 278 switch z.chaosType { 279 case "none": 280 // nothing to do 281 case "simple": 282 go z.chaosSimple() 283 case "flappy": 284 go z.chaosFlappy() 285 default: 286 log.Fatalf(context.Background(), "unknown chaos type: %s", z.chaosType) 287 } 288 } 289 290 func (z *zeroSum) check(d time.Duration) { 291 for { 292 time.Sleep(d) 293 if err := cluster.Consistent(context.Background(), z.LocalCluster, z.RandNode(rand.Intn)); err != nil { 294 z.maybeLogError(err) 295 } 296 } 297 } 298 299 func (z *zeroSum) verify(d time.Duration) { 300 for { 301 time.Sleep(d) 302 303 // Grab the count of accounts from committed transactions first. The number 304 // of accounts found by the SELECT should be at least this number. 305 committedAccounts := uint64(z.accountsLen()) 306 307 q := `SELECT count(*), sum(balance) FROM accounts` 308 var accounts uint64 309 var total int64 310 db := z.Nodes[z.RandNode(rand.Intn)].DB() 311 if err := db.QueryRow(q).Scan(&accounts, &total); err != nil { 312 z.maybeLogError(err) 313 continue 314 } 315 if total != 0 { 316 log.Fatalf(context.Background(), "unexpected total balance %d", total) 317 } 318 if accounts < committedAccounts { 319 log.Fatalf(context.Background(), "expected at least %d accounts, but found %d", 320 committedAccounts, accounts) 321 } 322 } 323 } 324 325 func (z *zeroSum) rangeInfo() (int, []int) { 326 replicas := make([]int, len(z.Nodes)) 327 db, err := z.NewDB(context.Background(), z.RandNode(rand.Intn)) 328 if err != nil { 329 z.maybeLogError(err) 330 return -1, replicas 331 } 332 rows, err := db.Query(`SELECT array_length(replicas, 1) FROM crdb_internal.ranges`) 333 if err != nil { 334 z.maybeLogError(err) 335 return -1, replicas 336 } 337 defer rows.Close() 338 339 var count int 340 for rows.Next() { 341 var numReplicas int 342 if err := rows.Scan(&numReplicas); err != nil { 343 z.maybeLogError(err) 344 return -1, replicas 345 } 346 for i := 0; i < numReplicas; i++ { 347 replicas[i]++ 348 } 349 count++ 350 } 351 352 return count, replicas 353 } 354 355 func (z *zeroSum) rangeStats(d time.Duration) { 356 for { 357 count, replicas := z.rangeInfo() 358 z.ranges.Lock() 359 z.ranges.count, z.ranges.replicas = count, replicas 360 z.ranges.Unlock() 361 362 time.Sleep(d) 363 } 364 } 365 366 func (z *zeroSum) formatReplicas(replicas []int) string { 367 var buf bytes.Buffer 368 for i := range replicas { 369 if i > 0 { 370 _, _ = buf.WriteString(" ") 371 } 372 fmt.Fprintf(&buf, "%d", replicas[i]) 373 if !z.Nodes[i].Alive() { 374 _, _ = buf.WriteString("*") 375 } 376 } 377 return buf.String() 378 } 379 380 func (z *zeroSum) monitor(d time.Duration) { 381 start := timeutil.Now() 382 lastTime := start 383 var lastOps uint64 384 385 for ticks := 0; true; ticks++ { 386 time.Sleep(d) 387 388 if ticks%20 == 0 { 389 fmt.Printf("_elapsed__accounts_________ops__ops/sec___errors___splits____xfers___ranges_____________replicas\n") 390 } 391 392 now := timeutil.Now() 393 elapsed := now.Sub(lastTime).Seconds() 394 ops := atomic.LoadUint64(&z.stats.ops) 395 396 z.ranges.Lock() 397 ranges, replicas := z.ranges.count, z.ranges.replicas 398 z.ranges.Unlock() 399 400 fmt.Printf("%8s %9d %11d %8.1f %8d %8d %8d %8d %20s\n", 401 time.Duration(now.Sub(start).Seconds()+0.5)*time.Second, 402 z.accountsLen(), ops, float64(ops-lastOps)/elapsed, 403 atomic.LoadUint64(&z.stats.errors), 404 atomic.LoadUint64(&z.stats.splits), 405 atomic.LoadUint64(&z.stats.transfers), 406 ranges, z.formatReplicas(replicas)) 407 lastTime = now 408 lastOps = ops 409 } 410 } 411 412 func main() { 413 flag.Parse() 414 415 cockroachBin := func() string { 416 bin := "./cockroach" 417 if _, err := os.Stat(bin); os.IsNotExist(err) { 418 bin = "cockroach" 419 } else if err != nil { 420 panic(err) 421 } 422 return bin 423 }() 424 425 perNodeCfg := localcluster.MakePerNodeFixedPortsCfg(*numNodes) 426 427 cfg := localcluster.ClusterConfig{ 428 DataDir: "cockroach-data-zerosum", 429 Binary: cockroachBin, 430 NumNodes: *numNodes, 431 NumWorkers: *workers, 432 AllNodeArgs: flag.Args(), 433 DB: "zerosum", 434 PerNodeCfg: perNodeCfg, 435 } 436 437 c := &localcluster.LocalCluster{Cluster: localcluster.New(cfg)} 438 defer c.Close() 439 440 log.SetExitFunc(false /* hideStack */, func(code int) { 441 c.Close() 442 os.Exit(code) 443 }) 444 445 signalCh := make(chan os.Signal, 1) 446 signal.Notify(signalCh, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) 447 448 go func() { 449 s := <-signalCh 450 log.Infof(context.Background(), "signal received: %v", s) 451 c.Close() 452 os.Exit(1) 453 }() 454 455 c.Start(context.Background()) 456 457 z := newZeroSum(c, *numAccounts, *chaosType) 458 z.run(*workers, *monkeys) 459 }