github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/zerosum/main.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	gosql "database/sql"
    17  	"flag"
    18  	"fmt"
    19  	"math/rand"
    20  	"os"
    21  	"os/signal"
    22  	"runtime"
    23  	"strings"
    24  	"sync/atomic"
    25  	"syscall"
    26  	"time"
    27  
    28  	"github.com/cockroachdb/cockroach-go/crdb"
    29  	"github.com/cockroachdb/cockroach/pkg/acceptance/cluster"
    30  	"github.com/cockroachdb/cockroach/pkg/acceptance/localcluster"
    31  	"github.com/cockroachdb/cockroach/pkg/keys"
    32  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    33  	"github.com/cockroachdb/cockroach/pkg/util/log"
    34  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    35  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    36  )
    37  
    38  var workers = flag.Int("w", 2*runtime.NumCPU(), "number of workers")
    39  var monkeys = flag.Int("m", 3, "number of monkeys")
    40  var numNodes = flag.Int("n", 4, "number of nodes")
    41  var numAccounts = flag.Int("a", 1e5, "number of accounts")
    42  var chaosType = flag.String("c", "simple", "chaos type [none|simple|flappy]")
    43  var verify = flag.Bool("verify", true, "verify range and account consistency")
    44  
    45  func newRand() *rand.Rand {
    46  	return rand.New(rand.NewSource(timeutil.Now().UnixNano()))
    47  }
    48  
    49  // zeroSum is a bank-like simulation that tests correctness in the face of
    50  // aggressive splits and lease transfers. A pool of workers chooses two random
    51  // accounts and increments the balance in one while decrementing the balance in
    52  // the other (leaving the total balance as zero, hence the name). A pool of
    53  // monkeys splits ranges and moves leases every second or so. Periodically, we
    54  // perform full cluster consistency checks as well as verify that the total
    55  // balance in the accounts table is zero.
    56  //
    57  // The account IDs used by workers and chosen as split points are selected from
    58  // a zipf distribution which tilts towards smaller IDs (and hence more
    59  // contention).
    60  type zeroSum struct {
    61  	*localcluster.LocalCluster
    62  	numAccounts int
    63  	chaosType   string
    64  	accounts    struct {
    65  		syncutil.Mutex
    66  		m map[uint64]struct{}
    67  	}
    68  	stats struct {
    69  		ops       uint64
    70  		errors    uint64
    71  		splits    uint64
    72  		transfers uint64
    73  	}
    74  	ranges struct {
    75  		syncutil.Mutex
    76  		count    int
    77  		replicas []int
    78  	}
    79  }
    80  
    81  func newZeroSum(c *localcluster.LocalCluster, numAccounts int, chaosType string) *zeroSum {
    82  	z := &zeroSum{
    83  		LocalCluster: c,
    84  		numAccounts:  numAccounts,
    85  		chaosType:    chaosType,
    86  	}
    87  	z.accounts.m = make(map[uint64]struct{})
    88  	return z
    89  }
    90  
    91  func (z *zeroSum) run(workers, monkeys int) {
    92  	tableID := z.setup()
    93  	for i := 0; i < workers; i++ {
    94  		go z.worker()
    95  	}
    96  	for i := 0; i < monkeys; i++ {
    97  		go z.monkey(tableID, 2*time.Second)
    98  	}
    99  	if workers > 0 || monkeys > 0 {
   100  		z.chaos()
   101  		if *verify {
   102  			go z.check(20 * time.Second)
   103  			go z.verify(10 * time.Second)
   104  		}
   105  	}
   106  	go z.rangeStats(time.Second)
   107  	z.monitor(time.Second)
   108  }
   109  
   110  func (z *zeroSum) setup() uint32 {
   111  	db := z.Nodes[0].DB()
   112  	if _, err := db.Exec("CREATE DATABASE IF NOT EXISTS zerosum"); err != nil {
   113  		log.Fatalf(context.Background(), "%v", err)
   114  	}
   115  
   116  	accounts := `
   117  CREATE TABLE IF NOT EXISTS accounts (
   118    id INT PRIMARY KEY,
   119    balance INT NOT NULL
   120  )
   121  `
   122  	if _, err := db.Exec(accounts); err != nil {
   123  		log.Fatalf(context.Background(), "%v", err)
   124  	}
   125  
   126  	tableIDQuery := `
   127  SELECT tables.id FROM system.namespace tables
   128    JOIN system.namespace dbs ON dbs.id = tables."parentID"
   129    WHERE dbs.name = $1 AND tables.name = $2
   130  `
   131  	var tableID uint32
   132  	if err := db.QueryRow(tableIDQuery, "zerosum", "accounts").Scan(&tableID); err != nil {
   133  		log.Fatalf(context.Background(), "%v", err)
   134  	}
   135  	return tableID
   136  }
   137  
   138  func (z *zeroSum) accountDistribution(r *rand.Rand) *rand.Zipf {
   139  	// We use a Zipf distribution for selecting accounts.
   140  	return rand.NewZipf(r, 1.1, float64(z.numAccounts/10), uint64(z.numAccounts-1))
   141  }
   142  
   143  func (z *zeroSum) accountsLen() int {
   144  	z.accounts.Lock()
   145  	defer z.accounts.Unlock()
   146  	return len(z.accounts.m)
   147  }
   148  
   149  func (z *zeroSum) maybeLogError(err error) {
   150  	if localcluster.IsUnavailableError(err) || strings.Contains(err.Error(), "range is frozen") {
   151  		return
   152  	}
   153  	log.Errorf(context.Background(), "%v", err)
   154  	atomic.AddUint64(&z.stats.errors, 1)
   155  }
   156  
   157  func (z *zeroSum) worker() {
   158  	r := newRand()
   159  	zipf := z.accountDistribution(r)
   160  
   161  	for {
   162  		from := zipf.Uint64()
   163  		to := zipf.Uint64()
   164  		if from == to {
   165  			continue
   166  		}
   167  
   168  		db := z.Nodes[z.RandNode(r.Intn)].DB()
   169  		if db == nil {
   170  			// Node is currently offline.
   171  			continue
   172  		}
   173  		err := crdb.ExecuteTx(context.Background(), db, nil, func(tx *gosql.Tx) error {
   174  			rows, err := tx.Query(`SELECT id, balance FROM accounts WHERE id IN ($1, $2)`, from, to)
   175  			if err != nil {
   176  				return err
   177  			}
   178  
   179  			var fromBalance, toBalance int64
   180  			for rows.Next() {
   181  				var id uint64
   182  				var balance int64
   183  				if err = rows.Scan(&id, &balance); err != nil {
   184  					log.Fatalf(context.Background(), "%v", err)
   185  				}
   186  				switch id {
   187  				case from:
   188  					fromBalance = balance
   189  				case to:
   190  					toBalance = balance
   191  				default:
   192  					panic(fmt.Sprintf("got unexpected account %d", id))
   193  				}
   194  			}
   195  
   196  			upsert := `UPSERT INTO accounts VALUES ($1, $3), ($2, $4)`
   197  			_, err = tx.Exec(upsert, to, from, toBalance+1, fromBalance-1)
   198  			return err
   199  		})
   200  		if err != nil {
   201  			z.maybeLogError(err)
   202  		} else {
   203  			atomic.AddUint64(&z.stats.ops, 1)
   204  			z.accounts.Lock()
   205  			z.accounts.m[from] = struct{}{}
   206  			z.accounts.m[to] = struct{}{}
   207  			z.accounts.Unlock()
   208  		}
   209  	}
   210  }
   211  
   212  func (z *zeroSum) monkey(tableID uint32, d time.Duration) {
   213  	r := newRand()
   214  	zipf := z.accountDistribution(r)
   215  
   216  	for {
   217  		time.Sleep(time.Duration(rand.Float64() * float64(d)))
   218  
   219  		key := keys.SystemSQLCodec.TablePrefix(tableID)
   220  		key = encoding.EncodeVarintAscending(key, int64(zipf.Uint64()))
   221  
   222  		switch r.Intn(2) {
   223  		case 0:
   224  			if err := z.Split(z.RandNode(r.Intn), key); err != nil {
   225  				z.maybeLogError(err)
   226  			} else {
   227  				atomic.AddUint64(&z.stats.splits, 1)
   228  			}
   229  		case 1:
   230  			if transferred, err := z.TransferLease(z.RandNode(r.Intn), r, key); err != nil {
   231  				z.maybeLogError(err)
   232  			} else if transferred {
   233  				atomic.AddUint64(&z.stats.transfers, 1)
   234  			}
   235  		}
   236  	}
   237  }
   238  
   239  func (z *zeroSum) chaosSimple() {
   240  	d := 15 * time.Second
   241  	fmt.Printf("chaos(simple): first event in %s\n", d)
   242  	time.Sleep(d)
   243  
   244  	nodeIdx := 0
   245  	node := z.Nodes[nodeIdx]
   246  	d = 20 * time.Second
   247  	fmt.Printf("chaos: killing node %d for %s\n", nodeIdx+1, d)
   248  	node.Kill()
   249  
   250  	time.Sleep(d)
   251  	fmt.Printf("chaos: starting node %d\n", nodeIdx+1)
   252  	node.Start(context.Background())
   253  }
   254  
   255  func (z *zeroSum) chaosFlappy() {
   256  	r := newRand()
   257  	d := time.Duration(15+r.Intn(30)) * time.Second
   258  	fmt.Printf("chaos(flappy): first event in %s\n", d)
   259  
   260  	for i := 1; true; i++ {
   261  		time.Sleep(d)
   262  
   263  		nodeIdx := z.RandNode(r.Intn)
   264  		node := z.Nodes[nodeIdx]
   265  		d = time.Duration(15+r.Intn(30)) * time.Second
   266  		fmt.Printf("chaos %d: killing node %d for %s\n", i, nodeIdx+1, d)
   267  		node.Kill()
   268  
   269  		time.Sleep(d)
   270  
   271  		d = time.Duration(15+r.Intn(30)) * time.Second
   272  		fmt.Printf("chaos %d: starting node %d, next event in %s\n", i, nodeIdx+1, d)
   273  		node.Start(context.Background())
   274  	}
   275  }
   276  
   277  func (z *zeroSum) chaos() {
   278  	switch z.chaosType {
   279  	case "none":
   280  		// nothing to do
   281  	case "simple":
   282  		go z.chaosSimple()
   283  	case "flappy":
   284  		go z.chaosFlappy()
   285  	default:
   286  		log.Fatalf(context.Background(), "unknown chaos type: %s", z.chaosType)
   287  	}
   288  }
   289  
   290  func (z *zeroSum) check(d time.Duration) {
   291  	for {
   292  		time.Sleep(d)
   293  		if err := cluster.Consistent(context.Background(), z.LocalCluster, z.RandNode(rand.Intn)); err != nil {
   294  			z.maybeLogError(err)
   295  		}
   296  	}
   297  }
   298  
   299  func (z *zeroSum) verify(d time.Duration) {
   300  	for {
   301  		time.Sleep(d)
   302  
   303  		// Grab the count of accounts from committed transactions first. The number
   304  		// of accounts found by the SELECT should be at least this number.
   305  		committedAccounts := uint64(z.accountsLen())
   306  
   307  		q := `SELECT count(*), sum(balance) FROM accounts`
   308  		var accounts uint64
   309  		var total int64
   310  		db := z.Nodes[z.RandNode(rand.Intn)].DB()
   311  		if err := db.QueryRow(q).Scan(&accounts, &total); err != nil {
   312  			z.maybeLogError(err)
   313  			continue
   314  		}
   315  		if total != 0 {
   316  			log.Fatalf(context.Background(), "unexpected total balance %d", total)
   317  		}
   318  		if accounts < committedAccounts {
   319  			log.Fatalf(context.Background(), "expected at least %d accounts, but found %d",
   320  				committedAccounts, accounts)
   321  		}
   322  	}
   323  }
   324  
   325  func (z *zeroSum) rangeInfo() (int, []int) {
   326  	replicas := make([]int, len(z.Nodes))
   327  	db, err := z.NewDB(context.Background(), z.RandNode(rand.Intn))
   328  	if err != nil {
   329  		z.maybeLogError(err)
   330  		return -1, replicas
   331  	}
   332  	rows, err := db.Query(`SELECT array_length(replicas, 1) FROM crdb_internal.ranges`)
   333  	if err != nil {
   334  		z.maybeLogError(err)
   335  		return -1, replicas
   336  	}
   337  	defer rows.Close()
   338  
   339  	var count int
   340  	for rows.Next() {
   341  		var numReplicas int
   342  		if err := rows.Scan(&numReplicas); err != nil {
   343  			z.maybeLogError(err)
   344  			return -1, replicas
   345  		}
   346  		for i := 0; i < numReplicas; i++ {
   347  			replicas[i]++
   348  		}
   349  		count++
   350  	}
   351  
   352  	return count, replicas
   353  }
   354  
   355  func (z *zeroSum) rangeStats(d time.Duration) {
   356  	for {
   357  		count, replicas := z.rangeInfo()
   358  		z.ranges.Lock()
   359  		z.ranges.count, z.ranges.replicas = count, replicas
   360  		z.ranges.Unlock()
   361  
   362  		time.Sleep(d)
   363  	}
   364  }
   365  
   366  func (z *zeroSum) formatReplicas(replicas []int) string {
   367  	var buf bytes.Buffer
   368  	for i := range replicas {
   369  		if i > 0 {
   370  			_, _ = buf.WriteString(" ")
   371  		}
   372  		fmt.Fprintf(&buf, "%d", replicas[i])
   373  		if !z.Nodes[i].Alive() {
   374  			_, _ = buf.WriteString("*")
   375  		}
   376  	}
   377  	return buf.String()
   378  }
   379  
   380  func (z *zeroSum) monitor(d time.Duration) {
   381  	start := timeutil.Now()
   382  	lastTime := start
   383  	var lastOps uint64
   384  
   385  	for ticks := 0; true; ticks++ {
   386  		time.Sleep(d)
   387  
   388  		if ticks%20 == 0 {
   389  			fmt.Printf("_elapsed__accounts_________ops__ops/sec___errors___splits____xfers___ranges_____________replicas\n")
   390  		}
   391  
   392  		now := timeutil.Now()
   393  		elapsed := now.Sub(lastTime).Seconds()
   394  		ops := atomic.LoadUint64(&z.stats.ops)
   395  
   396  		z.ranges.Lock()
   397  		ranges, replicas := z.ranges.count, z.ranges.replicas
   398  		z.ranges.Unlock()
   399  
   400  		fmt.Printf("%8s %9d %11d %8.1f %8d %8d %8d %8d %20s\n",
   401  			time.Duration(now.Sub(start).Seconds()+0.5)*time.Second,
   402  			z.accountsLen(), ops, float64(ops-lastOps)/elapsed,
   403  			atomic.LoadUint64(&z.stats.errors),
   404  			atomic.LoadUint64(&z.stats.splits),
   405  			atomic.LoadUint64(&z.stats.transfers),
   406  			ranges, z.formatReplicas(replicas))
   407  		lastTime = now
   408  		lastOps = ops
   409  	}
   410  }
   411  
   412  func main() {
   413  	flag.Parse()
   414  
   415  	cockroachBin := func() string {
   416  		bin := "./cockroach"
   417  		if _, err := os.Stat(bin); os.IsNotExist(err) {
   418  			bin = "cockroach"
   419  		} else if err != nil {
   420  			panic(err)
   421  		}
   422  		return bin
   423  	}()
   424  
   425  	perNodeCfg := localcluster.MakePerNodeFixedPortsCfg(*numNodes)
   426  
   427  	cfg := localcluster.ClusterConfig{
   428  		DataDir:     "cockroach-data-zerosum",
   429  		Binary:      cockroachBin,
   430  		NumNodes:    *numNodes,
   431  		NumWorkers:  *workers,
   432  		AllNodeArgs: flag.Args(),
   433  		DB:          "zerosum",
   434  		PerNodeCfg:  perNodeCfg,
   435  	}
   436  
   437  	c := &localcluster.LocalCluster{Cluster: localcluster.New(cfg)}
   438  	defer c.Close()
   439  
   440  	log.SetExitFunc(false /* hideStack */, func(code int) {
   441  		c.Close()
   442  		os.Exit(code)
   443  	})
   444  
   445  	signalCh := make(chan os.Signal, 1)
   446  	signal.Notify(signalCh, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
   447  
   448  	go func() {
   449  		s := <-signalCh
   450  		log.Infof(context.Background(), "signal received: %v", s)
   451  		c.Close()
   452  		os.Exit(1)
   453  	}()
   454  
   455  	c.Start(context.Background())
   456  
   457  	z := newZeroSum(c, *numAccounts, *chaosType)
   458  	z.run(*workers, *monkeys)
   459  }