github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/restart.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"time"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    19  )
    20  
    21  func runRestart(ctx context.Context, t *test, c *cluster, downDuration time.Duration) {
    22  	crdbNodes := c.Range(1, c.spec.NodeCount)
    23  	workloadNode := c.Node(1)
    24  	const restartNode = 3
    25  
    26  	t.Status("installing cockroach")
    27  	c.Put(ctx, cockroach, "./cockroach", crdbNodes)
    28  	c.Start(ctx, t, crdbNodes, startArgs(`--args=--vmodule=raft_log_queue=3`))
    29  
    30  	// We don't really need tpcc, we just need a good amount of traffic and a good
    31  	// amount of data.
    32  	t.Status("importing tpcc fixture")
    33  	c.Run(ctx, workloadNode,
    34  		"./cockroach workload fixtures import tpcc --warehouses=100 --fks=false --checks=false")
    35  
    36  	// Wait a full scanner cycle (10m) for the raft log queue to truncate the
    37  	// sstable entries from the import. They're huge and are not representative of
    38  	// normal traffic.
    39  	//
    40  	// NB: less would probably do a good enough job, but let's play it safe.
    41  	//
    42  	// TODO(dan/tbg): It's awkward that this is necessary. We should be able to
    43  	// do a better job here, for example by truncating only a smaller prefix of
    44  	// the log instead of all of it (right now there's no notion of per-entry
    45  	// size when we do truncate). Also having quiescing ranges truncate to
    46  	// lastIndex will be helpful because that drives the log size down eagerly
    47  	// when things are healthy.
    48  	t.Status("waiting for addsstable truncations")
    49  	time.Sleep(11 * time.Minute)
    50  
    51  	// Stop a node.
    52  	c.Stop(ctx, c.Node(restartNode))
    53  
    54  	// Wait for between 10s and `server.time_until_store_dead` while sending
    55  	// traffic to one of the nodes that are not down. This used to cause lots of
    56  	// raft log truncation, which caused node 3 to need lots of snapshots when it
    57  	// came back up.
    58  	c.Run(ctx, workloadNode, "./cockroach workload run tpcc --warehouses=100 "+
    59  		fmt.Sprintf("--tolerate-errors --wait=false --duration=%s", downDuration))
    60  
    61  	// Bring it back up and make sure it can serve a query within a reasonable
    62  	// time limit. For now, less time than it was down for.
    63  	c.Start(ctx, t, c.Node(restartNode))
    64  
    65  	// Dialing the formerly down node may still be prevented by the circuit breaker
    66  	// for a short moment (seconds) after n3 restarts. If it happens, the COUNT(*)
    67  	// can fail with a "no inbound stream connection" error. This is not what we
    68  	// want to catch in this test, so work around it.
    69  	//
    70  	// See https://github.com/cockroachdb/cockroach/issues/38602.
    71  	time.Sleep(15 * time.Second)
    72  
    73  	start := timeutil.Now()
    74  	restartNodeDB := c.Conn(ctx, restartNode)
    75  	if _, err := restartNodeDB.Exec(`SELECT count(*) FROM tpcc.order_line`); err != nil {
    76  		t.Fatal(err)
    77  	}
    78  	if took := timeutil.Since(start); took > downDuration {
    79  		t.Fatalf(`expected to recover within %s took %s`, downDuration, took)
    80  	} else {
    81  		c.l.Printf(`connecting and query finished in %s`, took)
    82  	}
    83  }
    84  
    85  func registerRestart(r *testRegistry) {
    86  	r.Add(testSpec{
    87  		Name:    fmt.Sprintf("restart/down-for-2m"),
    88  		Owner:   OwnerKV,
    89  		Cluster: makeClusterSpec(3),
    90  		// "cockroach workload is only in 19.1+"
    91  		MinVersion: "v19.1.0",
    92  		Run: func(ctx context.Context, t *test, c *cluster) {
    93  			runRestart(ctx, t, c, 2*time.Minute)
    94  		},
    95  	})
    96  }