github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/quit.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/errors"
	"github.com/kr/pretty"
)

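// quitTest groups the state shared by the graceful-shutdown tests:
// the test harness, the cluster under test, and the start arguments
// that are reused whenever a node is (re)started.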
type quitTest struct {
	t    *test
	c    *cluster
	args option
}

// runQuitTransfersLeases performs rolling restarts on a
// 3-node cluster and ascertains that each node shutting down
// transfers all its leases reliably to other nodes prior to
// terminating.
func runQuitTransfersLeases(
	ctx context.Context,
	t *test,
	c *cluster,
	methodName string,
	method func(ctx context.Context, t *test, c *cluster, nodeID int),
) {
	q := quitTest{t: t, c: c}
	q.init(ctx)
	q.runTest(ctx, method)
}

func (q *quitTest) init(ctx context.Context) {
	q.args = startArgs(
		"--env=COCKROACH_SCAN_MAX_IDLE_TIME=5ms",               // iterate fast for rebalancing
		"-a", "--vmodule=store=1,replica=1,replica_proposal=1", // verbosity to troubleshoot drains
	)
	q.c.Put(ctx, cockroach, "./cockroach")
	q.c.Start(ctx, q.t, q.args)
}

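// Fatal forwards to the underlying test's Fatal so helpers on
// quitTest can fail the test directly.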
func (q *quitTest) Fatal(args ...interface{}) {
	q.t.Fatal(args...)
}

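// Fatalf forwards to the underlying test's Fatalf.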
func (q *quitTest) Fatalf(format string, args ...interface{}) {
	q.t.Fatalf(format, args...)
}

func (q *quitTest) runTest(
	ctx context.Context, method func(ctx context.Context, t *test, c *cluster, nodeID int),
) {
	q.waitForUpReplication(ctx)
	q.createRanges(ctx)
	q.setupIncrementalDrain(ctx)

	// runTest iterates through the cluster three times and restarts each
	// node in turn. After each node shutdown it verifies that there are
	// no leases held by the down node. (See the comments inside
	// checkNoLeases() for details.)
	//
	// The shutdown method is passed in via the 'method' parameter, used
	// below.
	q.t.l.Printf("now running restart loop\n")
	for i := 0; i < 3; i++ {
		q.t.l.Printf("iteration %d\n", i)
		for nodeID := 1; nodeID <= q.c.spec.NodeCount; nodeID++ {
			q.t.l.Printf("stopping node %d\n", nodeID)
			q.runWithTimeout(ctx, func(ctx context.Context) { method(ctx, q.t, q.c, nodeID) })
			q.runWithTimeout(ctx, func(ctx context.Context) { q.checkNoLeases(ctx, nodeID) })
			q.t.l.Printf("restarting node %d\n", nodeID)
			q.runWithTimeout(ctx, func(ctx context.Context) { q.restartNode(ctx, nodeID) })
		}
	}
}

// restartNode restarts one node and waits until it's up and ready to
// accept clients.
func (q *quitTest) restartNode(ctx context.Context, nodeID int) {
	q.c.Start(ctx, q.t, q.args, q.c.Node(nodeID))

	q.t.l.Printf("waiting for readiness of node %d\n", nodeID)
	// Now perform a SQL query. This achieves two goals:
	// - it waits until the server is ready.
	// - the particular query forces a cluster-wide RPC, which
	//   forces any circuit breaker to trip and re-establish
	//   the RPC connection if needed.
	db := q.c.Conn(ctx, nodeID)
	defer db.Close()
	if _, err := db.ExecContext(ctx, `TABLE crdb_internal.cluster_sessions`); err != nil {
		q.Fatal(err)
	}
}

func (q *quitTest) waitForUpReplication(ctx context.Context) {
	db := q.c.Conn(ctx, 1)
	defer db.Close()

	// We'll want rebalancing to be a bit faster than normal, so
	// that the up-replication does not take ages.
	if _, err := db.ExecContext(ctx, `SET CLUSTER SETTING kv.snapshot_rebalance.max_rate = '128MiB'`); err != nil {
		q.Fatal(err)
	}

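	// Poll the replication state: we are done once every range reports
	// at least 3 replicas, i.e. full 3-way replication.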
	err := retry.ForDuration(30*time.Second, func() error {
		q.t.l.Printf("waiting for up-replication\n")
		row := db.QueryRowContext(ctx, `SELECT min(array_length(replicas, 1)) FROM crdb_internal.ranges_no_leases`)
		minReplicas := 0
		if err := row.Scan(&minReplicas); err != nil {
			q.Fatal(err)
		}
		if minReplicas < 3 {
			time.Sleep(time.Second)
			return errors.Newf("some ranges not up-replicated yet")
		}
		return nil
	})
	if err != nil {
		q.Fatalf("cluster did not up-replicate: %v", err)
	}
}

// runWithTimeout runs the given function with a 1-minute timeout.
func (q *quitTest) runWithTimeout(ctx context.Context, fn func(ctx context.Context)) {
	if err := contextutil.RunWithTimeout(ctx, "do", time.Minute, func(ctx context.Context) error {
		fn(ctx)
		return nil
	}); err != nil {
		q.Fatal(err)
	}
}

// setupIncrementalDrain simulates requiring more than one Drain round
// to transfer all leases. This way, we exercise the iterating code in
// quit/node drain.
func (q *quitTest) setupIncrementalDrain(ctx context.Context) {
	db := q.c.Conn(ctx, 1)
	defer db.Close()
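	// With only 10ms allowed for lease transfers, a single drain round
	// is unlikely to move every lease off the node, which forces the
	// drain logic to iterate (see the function comment above).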
	if _, err := db.ExecContext(ctx, `
SET CLUSTER SETTING server.shutdown.lease_transfer_wait = '10ms'`); err != nil {
		if strings.Contains(err.Error(), "unknown cluster setting") {
			// old version; ok
		} else {
			q.Fatal(err)
		}
	}
}

// createRanges creates a bunch of ranges on the test cluster.
func (q *quitTest) createRanges(ctx context.Context) {
	const numRanges = 500

	db := q.c.Conn(ctx, 1)
	defer db.Close()
	if _, err := db.ExecContext(ctx, fmt.Sprintf(`
CREATE TABLE t(x, y, PRIMARY KEY(x)) AS SELECT @1, 1 FROM generate_series(1,%[1]d)`,
		numRanges)); err != nil {
		q.Fatal(err)
	}
	// We split them from right-to-left so we're peeling at most 1
	// row each time on the right.
	//
	// Also we do it a hundred at a time, so as to be able to see the
	// progress when watching the roachtest progress interactively.
	for i := numRanges; i > 1; i -= 100 {
		q.t.l.Printf("creating %d ranges (%d-%d)...\n", numRanges, i, i-99)
		if _, err := db.ExecContext(ctx, fmt.Sprintf(`
ALTER TABLE t SPLIT AT TABLE generate_series(%[1]d,%[1]d-99,-1)`, i)); err != nil {
			q.Fatal(err)
		}
	}
}

// checkNoLeases verifies that no range has a lease on the node
// that's just been shut down.
func (q *quitTest) checkNoLeases(ctx context.Context, nodeID int) {
	// We need to use SQL against a node that's not the one we're
	// shutting down.
	otherNodeID := 1 + nodeID%q.c.spec.NodeCount
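	// (For example, with 3 nodes, shutting down n1 queries n2, shutting
	// down n2 queries n3, and shutting down n3 wraps around to n1.)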

	// Now we're going to check two things:
	//
	// 1) *immediately*, that every range in the cluster has a lease
	//    some other place than nodeID.
	//
	//    Note that with this condition, it is possible that _some_
	//    replica of any given range still thinks that the leaseholder is
	//    nodeID, even though _another_ replica has become leaseholder
	//    already. That's because followers can lag behind and
	//    drain does not wait for followers to catch up.
	//    https://github.com/cockroachdb/cockroach/issues/47100
	//
	// 2) *eventually* that every other node than nodeID has no range
	//    replica whose lease refers to nodeID, i.e. the followers
	//    have all caught up.
	//    Note: when issue #47100 is fixed, this 2nd condition
	//    must be true immediately -- drain is then able to wait
	//    for all followers to learn who the new leaseholder is.

	if err := testutils.SucceedsSoonError(func() error {
		// To achieve that, we first ask each node in turn for its range
		// report.
		//
		// For condition (1) we accumulate all the known ranges in
		// knownRanges, and assign them the node ID of their leaseholder
		// whenever it is not nodeID. Then at the end we check that every
		// entry in the map has a non-zero value.
		knownRanges := map[string]int{}
		//
		// For condition (2) we accumulate the unwanted leases in
		// invLeaseMap, then check at the end that the map is empty.
		invLeaseMap := map[int][]string{}
		for i := 1; i <= q.c.spec.NodeCount; i++ {
			if i == nodeID {
				// Can't request this node. Ignore.
				continue
			}

			q.t.l.Printf("retrieving ranges for node %d\n", i)
			// Get the report via HTTP.
			// Flag -s is to remove progress on stderr, so that the buffer
			// contains the JSON of the response and nothing else.
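			// Note that the request is addressed to the admin port of the
			// live node otherNodeID, while the final path component selects
			// which node's ranges are being reported; this lets us collect
			// every live node's view without contacting the down node.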
			buf, err := q.c.RunWithBuffer(ctx, q.t.l, q.c.Node(otherNodeID),
				"curl", "-s", fmt.Sprintf("http://%s/_status/ranges/%d",
					q.c.InternalAdminUIAddr(ctx, q.c.Node(otherNodeID))[0], i))
			if err != nil {
				q.Fatal(err)
			}
			// We need just a subset of the response. Make an ad-hoc
			// struct with just the bits of interest.
			type jsonOutput struct {
				Ranges []struct {
					State struct {
						State struct {
							Desc struct {
								RangeID string `json:"rangeId"`
							} `json:"desc"`
							Lease struct {
								Replica struct {
									NodeID int `json:"nodeId"`
								} `json:"replica"`
							} `json:"lease"`
						} `json:"state"`
					} `json:"state"`
				} `json:"ranges"`
			}
			var details jsonOutput
			if err := json.Unmarshal(buf, &details); err != nil {
				q.Fatal(err)
			}
			// Some sanity check.
			if len(details.Ranges) == 0 {
				q.Fatal("expected some ranges from RPC, got none")
			}
			// Is there any range whose lease refers to nodeID?
			var invalidLeases []string
			for _, r := range details.Ranges {
				// Some more sanity checks.
				if r.State.State.Lease.Replica.NodeID == 0 {
					q.Fatalf("expected a valid lease state, got %# v", pretty.Formatter(r))
				}
				curLeaseHolder := knownRanges[r.State.State.Desc.RangeID]
				if r.State.State.Lease.Replica.NodeID == nodeID {
					// As per condition (2) above we want to know which ranges
					// have an unexpected left over lease on nodeID.
					invalidLeases = append(invalidLeases, r.State.State.Desc.RangeID)
				} else {
					// As per condition (1) above we track in knownRanges whether
					// there is at least one node other than nodeID that thinks
					// that the lease has been transferred.
					curLeaseHolder = r.State.State.Lease.Replica.NodeID
				}
				knownRanges[r.State.State.Desc.RangeID] = curLeaseHolder
			}
			if len(invalidLeases) > 0 {
				invLeaseMap[i] = invalidLeases
			}
		}
		// (1): is there a range with no lease outside of nodeID?
		var leftOver []string
		for r, n := range knownRanges {
			if n == 0 {
				leftOver = append(leftOver, r)
			}
		}
		if len(leftOver) > 0 {
			q.Fatalf("(1) ranges with no lease outside of node %d: %# v", nodeID, pretty.Formatter(leftOver))
		}
		// (2): is there a range with a left over lease on nodeID?
		//
		// TODO(knz): Eventually we want this condition to be always
		// true, i.e. fail the test immediately if found to be false
		// instead of waiting. (#47100)
		if len(invLeaseMap) > 0 {
			err := errors.Newf(
				"(2) ranges with remaining leases on node %d, per node: %# v",
				nodeID, pretty.Formatter(invLeaseMap))
			q.t.l.Printf("condition failed: %v\n", err)
			q.t.l.Printf("retrying until SucceedsSoon has enough...\n")
			return err
		}
		return nil
	}); err != nil {
		q.Fatal(err)
	}

	db := q.c.Conn(ctx, otherNodeID)
	defer db.Close()
	// For good measure, also write to the table. This ensures it
	// remains available.
	if _, err := db.ExecContext(ctx, `UPDATE t SET y = y + 1`); err != nil {
		q.Fatal(err)
	}
}

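// registerQuitTransfersLeases registers the transfer-leases/{signal,
// quit,drain} tests, each of which exercises a different graceful
// shutdown method on a 3-node cluster.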
func registerQuitTransfersLeases(r *testRegistry) {
	registerTest := func(name, minver string, method func(context.Context, *test, *cluster, int)) {
		r.Add(testSpec{
			Name:       fmt.Sprintf("transfer-leases/%s", name),
			Owner:      OwnerKV,
			Cluster:    makeClusterSpec(3),
			MinVersion: minver,
			Run: func(ctx context.Context, t *test, c *cluster) {
				runQuitTransfersLeases(ctx, t, c, name, method)
			},
		})
	}

	// Uses 'roachprod stop --sig 15 --wait', i.e. send SIGTERM and wait
	// until the process exits.
	registerTest("signal", "v19.2.0", func(ctx context.Context, t *test, c *cluster, nodeID int) {
		c.Stop(ctx, c.Node(nodeID),
			roachprodArgOption{"--sig", "15", "--wait"}, // graceful shutdown
		)
	})

	// Uses 'cockroach quit' which should drain and then request a
	// shutdown. It then waits for the process to self-exit.
	registerTest("quit", "v19.2.0", func(ctx context.Context, t *test, c *cluster, nodeID int) {
		_ = runQuit(ctx, t, c, nodeID)
	})

	// Uses 'cockroach drain', followed by a non-graceful process
	// kill. If the drain is successful, the leases are transferred
	// successfully even if the process terminates non-gracefully.
	registerTest("drain", "v20.1.0", func(ctx context.Context, t *test, c *cluster, nodeID int) {
		buf, err := c.RunWithBuffer(ctx, t.l, c.Node(nodeID),
			"./cockroach", "node", "drain", "--insecure", "--logtostderr=INFO",
			fmt.Sprintf("--port={pgport:%d}", nodeID),
		)
		t.l.Printf("cockroach node drain:\n%s\n", buf)
		if err != nil {
			t.Fatal(err)
		}
		// First send SIGHUP to the process to force it to flush its logs
		// before terminating. Otherwise the SIGKILL below will truncate
		// the log.
		c.Stop(ctx, c.Node(nodeID),
			roachprodArgOption{"--sig", "1"},
		)
		// We use SIGKILL to terminate nodes here. Of course, an operator
		// should not do this and instead terminate with SIGTERM even
		// after a complete graceful drain. However, what this test is
		// asserting is that a graceful drain is *sufficient* to make
		// everything look smooth from the perspective of other nodes,
		// even if the node goes "kaput" after the drain.
		//
		// (This also ensures that the test exercises separate code; if we
		// used SIGTERM here we'd be combining the graceful drain by 'node
		// drain' with the graceful drain by the signal handler. If either
		// becomes broken, the test wouldn't help identify which one needs
		// attention.)
		c.Stop(ctx, c.Node(nodeID),
			roachprodArgOption{"--sig", "9", "--wait"})
	})
}

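// runQuit runs 'cockroach quit' against the given node, logs the
// command output, and then waits (via a no-op signal) for the process
// to exit. The output is returned so callers can inspect it.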
func runQuit(ctx context.Context, t *test, c *cluster, nodeID int, extraArgs ...string) []byte {
	args := append([]string{
		"./cockroach", "quit", "--insecure", "--logtostderr=INFO",
		fmt.Sprintf("--port={pgport:%d}", nodeID)},
		extraArgs...)
	buf, err := c.RunWithBuffer(ctx, t.l, c.Node(nodeID), args...)
	t.l.Printf("cockroach quit:\n%s\n", buf)
	if err != nil {
		t.Fatal(err)
	}
	c.Stop(ctx, c.Node(nodeID),
		roachprodArgOption{"--sig", "0", "--wait"}, // no shutdown, just wait for exit
	)
	return buf
}

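// registerQuitAllNodes registers the quit-all-nodes test, which shuts
// down every node of a 5-node cluster in sequence with 'cockroach
// quit'.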
func registerQuitAllNodes(r *testRegistry) {
	// This test verifies that 'cockroach quit' can terminate all nodes
	// in the cluster: normally as long as there's quorum, then with a
	// short --drain-wait for the remaining nodes under quorum.
	r.Add(testSpec{
		Name:       "quit-all-nodes",
		Owner:      OwnerKV,
		Cluster:    makeClusterSpec(5),
		MinVersion: "v20.1.0",
		Run: func(ctx context.Context, t *test, c *cluster) {
			q := quitTest{t: t, c: c}

			// Start the cluster.
			q.init(ctx)
			// Wait for up-replication so that the cluster expects 1 replica
			// everywhere for system ranges.
			q.waitForUpReplication(ctx)

			// Shut one node down gracefully with a very long wait (longer
			// than the test timeout). This is guaranteed to work - we still
			// have quorum at that point.
			q.runWithTimeout(ctx, func(ctx context.Context) { _ = runQuit(ctx, q.t, q.c, 5, "--drain-wait=1h") })

			// Now shut down the remaining 4 nodes less gracefully, with a
			// short wait.

			// For the next two nodes, we may or may not observe that
			// the graceful shutdown succeeds. It may succeed if every
			// range has enough quorum on the last 2 nodes (shut down later below).
			// It may fail if some ranges have a quorum composed of n3, n4, n5.
			// See: https://github.com/cockroachdb/cockroach/issues/48339
			q.runWithTimeout(ctx, func(ctx context.Context) { _ = runQuit(ctx, q.t, q.c, 4, "--drain-wait=4s") })
			q.runWithTimeout(ctx, func(ctx context.Context) { _ = runQuit(ctx, q.t, q.c, 3, "--drain-wait=4s") })

			// For the last two nodes, we are always under quorum. In this
			// case we can expect `quit` to always report a hard shutdown
			// was required.
			q.runWithTimeout(ctx, func(ctx context.Context) { expectHardShutdown(ctx, q.t, runQuit(ctx, q.t, q.c, 2, "--drain-wait=4s")) })
			q.runWithTimeout(ctx, func(ctx context.Context) { expectHardShutdown(ctx, q.t, runQuit(ctx, q.t, q.c, 1, "--drain-wait=4s")) })

			// At the end, restart all nodes. We do this to check that
			// the cluster can indeed restart, and also to please
			// the dead node detection check at the end of each test.
			q.c.Start(ctx, q.t, q.args)
		},
	})
}

// expectHardShutdown fails the test unless the 'quit' command output
// contains the "drain did not complete successfully" message, i.e. a
// hard shutdown was required.
func expectHardShutdown(ctx context.Context, t *test, cmdOut []byte) {
	if !strings.Contains(string(cmdOut), "drain did not complete successfully") {
		t.Fatalf("expected 'drain did not complete successfully' in quit output, got:\n%s", cmdOut)
	}
}