github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/allocator.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	gosql "database/sql"
	"fmt"
	"math"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
)

func registerAllocator(r *testRegistry) {
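	// runAllocator restores a TPC-H scale-factor-10 fixture onto the first
	// `start` nodes, starts the remaining nodes, and then waits until
	// rebalancing settles with a per-store range-count stddev of at most
	// maxStdDev.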
	runAllocator := func(ctx context.Context, t *test, c *cluster, start int, maxStdDev float64) {
		const fixturePath = `gs://cockroach-fixtures/workload/tpch/scalefactor=10/backup`
		c.Put(ctx, cockroach, "./cockroach")
		c.Put(ctx, workload, "./workload")

		// Start the first `start` nodes and restore the fixture.
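		// The vmodule flags turn up logging for the allocator machinery so
		// that rebalancing decisions can be diagnosed from the node logs.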
		args := startArgs("--args=--vmodule=store_rebalancer=5,allocator=5,allocator_scorer=5,replicate_queue=5")
		c.Start(ctx, t, c.Range(1, start), args)
		db := c.Conn(ctx, 1)
		defer db.Close()

		m := newMonitor(ctx, c, c.Range(1, start))
		m.Go(func(ctx context.Context) error {
			t.Status("loading fixture")
			if _, err := db.Exec(`RESTORE DATABASE tpch FROM $1`, fixturePath); err != nil {
				t.Fatal(err)
			}
			return nil
		})
		m.Wait()

		// Start the remaining nodes to kick off upreplication/rebalancing.
		c.Start(ctx, t, c.Range(start+1, c.spec.NodeCount), args)

		c.Run(ctx, c.Node(1), `./workload init kv --drop`)
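		// Run a kv workload on every node in the background so that
		// rebalancing happens under load rather than on an idle cluster.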
		for node := 1; node <= c.spec.NodeCount; node++ {
			node := node
			// TODO(dan): Ideally, the test would fail if this queryload failed,
			// but we can't put it in monitor as-is because the test deadlocks.
			go func() {
				const cmd = `./workload run kv --tolerate-errors --min-block-bytes=8 --max-block-bytes=127`
				l, err := t.l.ChildLogger(fmt.Sprintf(`kv-%d`, node))
				if err != nil {
					t.Fatal(err)
				}
				defer l.close()
				_ = execCmd(ctx, t.l, roachprod, "ssh", c.makeNodes(c.Node(node)), "--", cmd)
			}()
		}

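		// Wait for rebalancing to finish while the workload continues in the
		// background; the monitor also watches for unexpected node deaths.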
		m = newMonitor(ctx, c, c.All())
		m.Go(func(ctx context.Context) error {
			t.Status("waiting for rebalance")
			return waitForRebalance(ctx, t.l, db, maxStdDev)
		})
		m.Wait()
	}

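	// The spec names describe the replica movement: "up/1to3" starts one node
	// and up-replicates onto three, while "rebalance/3to5" starts three nodes
	// and rebalances onto five. The final argument is the maximum allowed
	// stddev of per-store range counts.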
	r.Add(testSpec{
		Name:    `replicate/up/1to3`,
		Owner:   OwnerKV,
		Cluster: makeClusterSpec(3),
		Run: func(ctx context.Context, t *test, c *cluster) {
			runAllocator(ctx, t, c, 1, 10.0)
		},
	})
	r.Add(testSpec{
		Name:    `replicate/rebalance/3to5`,
		Owner:   OwnerKV,
		Cluster: makeClusterSpec(5),
		Run: func(ctx context.Context, t *test, c *cluster) {
			runAllocator(ctx, t, c, 3, 42.0)
		},
	})
	r.Add(testSpec{
		Name:    `replicate/wide`,
		Owner:   OwnerKV,
		Timeout: 10 * time.Minute,
		Cluster: makeClusterSpec(9, cpu(1)),
		Run:     runWideReplication,
	})
}

// printRebalanceStats prints the time it took for rebalancing to finish and the
// final standard deviation of replica counts across stores.
func printRebalanceStats(l *logger, db *gosql.DB) error {
	// TODO(cuongdo): Output these in a machine-friendly way and graph.

	// Output time it took to rebalance.
	{
		var rebalanceIntervalStr string
		if err := db.QueryRow(
			`SELECT (SELECT max(timestamp) FROM system.rangelog) - `+
				`(SELECT max(timestamp) FROM system.eventlog WHERE "eventType"=$1)`,
			`node_join`, /* sql.EventLogNodeJoin */
		).Scan(&rebalanceIntervalStr); err != nil {
			return err
		}
		l.Printf("cluster took %s to rebalance\n", rebalanceIntervalStr)
	}

	// Output # of range events that occurred. All other things being equal,
	// larger numbers are worse and potentially indicate thrashing.
	{
		var rangeEvents int64
		q := `SELECT count(*) FROM system.rangelog`
		if err := db.QueryRow(q).Scan(&rangeEvents); err != nil {
			return err
		}
		l.Printf("%d range events\n", rangeEvents)
	}

	// Output standard deviation of the replica counts for all stores.
	{
		var stdDev float64
		if err := db.QueryRow(
			`SELECT stddev(range_count) FROM crdb_internal.kv_store_status`,
		).Scan(&stdDev); err != nil {
			return err
		}
		l.Printf("stdDev(replica count) = %.2f\n", stdDev)
	}

	// Output the number of ranges on each store.
	{
		rows, err := db.Query(`SELECT store_id, range_count FROM crdb_internal.kv_store_status`)
		if err != nil {
			return err
		}
		defer rows.Close()
		for rows.Next() {
			var storeID, rangeCount int64
			if err := rows.Scan(&storeID, &rangeCount); err != nil {
				return err
			}
			l.Printf("s%d has %d ranges\n", storeID, rangeCount)
		}
	}

	return nil
}

type replicationStats struct {
	SecondsSinceLastEvent int64
	EventType             string
	RangeID               int64
	StoreID               int64
	ReplicaCountStdDev    float64
}

func (s replicationStats) String() string {
	return fmt.Sprintf("last range event: %s for range %d/store %d (%ds ago)",
		s.EventType, s.RangeID, s.StoreID, s.SecondsSinceLastEvent)
}

// allocatorStats returns the duration of stability (i.e. no replication
// changes) and the standard deviation in replica counts. Only unrecoverable
// errors are returned.
func allocatorStats(db *gosql.DB) (s replicationStats, err error) {
	defer func() {
		if err != nil {
			s.ReplicaCountStdDev = math.MaxFloat64
		}
	}()
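	// If the stats queries fail, report the replica counts as maximally
	// unbalanced rather than leaving the zero value, which could read as a
	// perfectly balanced cluster.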

	// NB: These mirror the storage.RangeLogEventType enum values; the type
	// itself is intentionally not imported to avoid pulling in the dependency.
	eventTypes := []interface{}{
		`split`, `add`, `remove`,
	}

	q := `SELECT extract_duration(seconds FROM now()-timestamp), "rangeID", "storeID", "eventType"` +
		` FROM system.rangelog WHERE "eventType" IN ($1, $2, $3) ORDER BY timestamp DESC LIMIT 1`

	row := db.QueryRow(q, eventTypes...)
	if row == nil {
		// This should never happen, because the archived store we're starting with
		// will always have some range events.
		return replicationStats{}, errors.New("couldn't find any range events")
	}
	if err := row.Scan(&s.SecondsSinceLastEvent, &s.RangeID, &s.StoreID, &s.EventType); err != nil {
		return replicationStats{}, err
	}

	if err := db.QueryRow(
		`SELECT stddev(range_count) FROM crdb_internal.kv_store_status`,
	).Scan(&s.ReplicaCountStdDev); err != nil {
		return replicationStats{}, err
	}

	return s, nil
}

// waitForRebalance waits until there have been no recent range adds, removes,
// or splits. We wait until the cluster is balanced or until `stableSeconds`
// elapses, whichever comes first, and then print stats about the rebalancing
// process. If the replica count appears unbalanced, an error is returned.
//
// This method is crude but necessary. If we were to wait until range counts
// were just about even, we'd miss potential post-rebalance thrashing.
func waitForRebalance(ctx context.Context, l *logger, db *gosql.DB, maxStdDev float64) error {
	const statsInterval = 20 * time.Second
	const stableSeconds = 3 * 60
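	// statsInterval is how often the range log is polled; stableSeconds is how
	// long the cluster must go without a split/add/remove event before
	// rebalancing is considered complete.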

	var statsTimer timeutil.Timer
	defer statsTimer.Stop()
	statsTimer.Reset(statsInterval)
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-statsTimer.C:
			statsTimer.Read = true
			stats, err := allocatorStats(db)
			if err != nil {
				return err
			}

			l.Printf("%v\n", stats)
			if stableSeconds <= stats.SecondsSinceLastEvent {
				l.Printf("replica count stddev = %f, max allowed stddev = %f\n", stats.ReplicaCountStdDev, maxStdDev)
				if stats.ReplicaCountStdDev > maxStdDev {
					_ = printRebalanceStats(l, db)
					return errors.Errorf(
						"%ds elapsed without changes, but replica count standard "+
							"deviation is %.2f (>%.2f)", stats.SecondsSinceLastEvent,
						stats.ReplicaCountStdDev, maxStdDev)
				}
				return printRebalanceStats(l, db)
			}
			statsTimer.Reset(statsInterval)
		}
	}
}

func runWideReplication(ctx context.Context, t *test, c *cluster) {
	nodes := c.spec.NodeCount
	if nodes != 9 {
		t.Fatalf("9-node cluster required")
	}

	args := startArgs("--env=COCKROACH_SCAN_MAX_IDLE_TIME=5ms")
	c.Put(ctx, cockroach, "./cockroach")
	c.Start(ctx, t, c.All(), args)

	db := c.Conn(ctx, 1)
	defer db.Close()

	zones := func() []string {
		oldVersion := false
		rows, err := db.Query(`SELECT target FROM crdb_internal.zones`)
		// TODO(solon): Remove this block once we are no longer running roachtest
		// against version 19.1 and earlier.
		if err != nil && strings.Contains(err.Error(), `column "target" does not exist`) {
			oldVersion = true
			rows, err = db.Query(`SELECT zone_name FROM crdb_internal.zones`)
		}
		if err != nil {
			t.Fatal(err)
		}
		defer rows.Close()
		var results []string
		for rows.Next() {
			var name string
			if err := rows.Scan(&name); err != nil {
				t.Fatal(err)
			}
			// TODO(solon): Remove this block once we are no longer running roachtest
			// against version 19.1 and earlier.
			if oldVersion {
				which := "RANGE"
				if name[0] == '.' {
					name = name[1:]
				} else if strings.Count(name, ".") == 0 {
					which = "DATABASE"
				} else {
					which = "TABLE"
				}
				name = fmt.Sprintf("%s %s", which, name)
			}
			results = append(results, name)
		}
		return results
	}

	run := func(stmt string) {
		t.l.Printf("%s\n", stmt)
		if _, err := db.ExecContext(ctx, stmt); err != nil {
			t.Fatal(err)
		}
	}

	setReplication := func(width int) {
		// Change every zone to have the same number of replicas as the number of
		// nodes in the cluster.
		for _, zone := range zones() {
			run(fmt.Sprintf(`ALTER %s CONFIGURE ZONE USING num_replicas = %d`, zone, width))
		}
	}
	setReplication(nodes)

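	// countMisreplicated reports how many ranges do not have exactly `width`
	// replicas, according to the gateway node's view of the range metadata.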
	countMisreplicated := func(width int) int {
		var count int
		if err := db.QueryRow(
			"SELECT count(*) FROM crdb_internal.ranges WHERE array_length(replicas,1) != $1",
			width,
		).Scan(&count); err != nil {
			t.Fatal(err)
		}
		return count
	}

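	// waitForReplication polls once per second until every range has exactly
	// `width` replicas.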
	waitForReplication := func(width int) {
		for count := -1; count != 0; time.Sleep(time.Second) {
			count = countMisreplicated(width)
			t.l.Printf("%d mis-replicated ranges\n", count)
		}
	}

	waitForReplication(nodes)

	numRanges := func() int {
		var count int
		if err := db.QueryRow(`SELECT count(*) FROM crdb_internal.ranges`).Scan(&count); err != nil {
			t.Fatal(err)
		}
		return count
	}()

	// Stop the cluster and restart 2/3 of the nodes.
	c.Stop(ctx)
	c.Start(ctx, t, c.Range(1, 6), args)

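	// With only 6 of 9 nodes up, every 9-way replicated range is
	// under-replicated, but quorum (5 of 9) is intact, so the ranges should
	// stay available. waitForUnderReplicated polls the store metrics until at
	// least `count` ranges report as under-replicated, and fails the test if
	// any ranges remain unavailable for more than 30 seconds.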
	waitForUnderReplicated := func(count int) {
		for start := timeutil.Now(); ; time.Sleep(time.Second) {
			query := `
SELECT sum((metrics->>'ranges.unavailable')::DECIMAL)::INT AS ranges_unavailable,
       sum((metrics->>'ranges.underreplicated')::DECIMAL)::INT AS ranges_underreplicated
FROM crdb_internal.kv_store_status
`
			var unavailable, underReplicated int
			if err := db.QueryRow(query).Scan(&unavailable, &underReplicated); err != nil {
				t.Fatal(err)
			}
			t.l.Printf("%d unavailable, %d under-replicated ranges\n", unavailable, underReplicated)
			if unavailable != 0 {
				// A freshly started cluster might show unavailable ranges for a brief
				// period of time due to the way that metric is calculated. Only
				// complain about unavailable ranges if they persist for too long.
				if timeutil.Since(start) >= 30*time.Second {
					t.Fatalf("%d unavailable ranges", unavailable)
				}
				continue
			}
			if underReplicated >= count {
				break
			}
		}
	}

	waitForUnderReplicated(numRanges)
	if n := countMisreplicated(9); n != 0 {
		t.Fatalf("expected 0 mis-replicated ranges, but found %d", n)
	}

	decom := func(id int) {
		c.Run(ctx, c.Node(1),
			fmt.Sprintf("./cockroach node decommission --insecure --wait=none %d", id))
	}

	// Decommission a node. The ranges should down-replicate to 7 replicas.
	decom(9)
	waitForReplication(7)

	// Set the replication width to 5. The replicas should down-replicate, though
	// this currently requires the time-until-store-dead threshold to pass
	// because the allocator cannot select a replica for removal that is on a
	// store for which it doesn't have a store descriptor.
	run(`SET CLUSTER SETTING server.time_until_store_dead = '90s'`)
	setReplication(5)
	waitForReplication(5)

	// Restart the down nodes to prevent the dead node detector from complaining.
	c.Start(ctx, t, c.Range(7, 9))
}