github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/restart.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

func runRestart(ctx context.Context, t *test, c *cluster, downDuration time.Duration) {
	crdbNodes := c.Range(1, c.spec.NodeCount)
	workloadNode := c.Node(1)
	const restartNode = 3

	t.Status("installing cockroach")
	c.Put(ctx, cockroach, "./cockroach", crdbNodes)
	c.Start(ctx, t, crdbNodes, startArgs(`--args=--vmodule=raft_log_queue=3`))

	// We don't really need tpcc; we just need a good amount of traffic and a
	// good amount of data.
	t.Status("importing tpcc fixture")
	c.Run(ctx, workloadNode,
		"./cockroach workload fixtures import tpcc --warehouses=100 --fks=false --checks=false")

	// Wait a full scanner cycle (10m) for the raft log queue to truncate the
	// sstable entries from the import. They're huge and are not representative
	// of normal traffic.
	//
	// NB: less would probably do a good enough job, but let's play it safe.
	//
	// TODO(dan/tbg): It's awkward that this is necessary. We should be able to
	// do a better job here, for example by truncating only a smaller prefix of
	// the log instead of all of it (right now there's no notion of per-entry
	// size when we do truncate). Also having quiescing ranges truncate to
	// lastIndex will be helpful because that drives the log size down eagerly
	// when things are healthy.
	t.Status("waiting for addsstable truncations")
	time.Sleep(11 * time.Minute)

	// Stop a node.
	c.Stop(ctx, c.Node(restartNode))

	// Wait for between 10s and `server.time_until_store_dead` while sending
	// traffic to one of the nodes that are not down. This used to cause lots
	// of raft log truncation, which caused node 3 to need lots of snapshots
	// when it came back up.
	c.Run(ctx, workloadNode, "./cockroach workload run tpcc --warehouses=100 "+
		fmt.Sprintf("--tolerate-errors --wait=false --duration=%s", downDuration))

	// Bring it back up and make sure it can serve a query within a reasonable
	// time limit. For now, less time than it was down for.
	c.Start(ctx, t, c.Node(restartNode))

	// Dialing the formerly down node may still be prevented by the circuit
	// breaker for a short moment (seconds) after n3 restarts. If that happens,
	// the COUNT(*) below can fail with a "no inbound stream connection" error.
	// This is not what we want to catch in this test, so work around it.
	//
	// See https://github.com/cockroachdb/cockroach/issues/38602.
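	//
	// A more targeted alternative to the fixed sleep (a hypothetical sketch
	// only, not what this test does; it assumes pkg/util/retry's ForDuration
	// helper and the corresponding import) would be to poll until the node
	// accepts queries:
	//
	//   db := c.Conn(ctx, restartNode)
	//   defer db.Close()
	//   if err := retry.ForDuration(time.Minute, func() error {
	//   	// Any trivial query works; it fails while the breaker is open.
	//   	_, err := db.Exec(`SELECT 1`)
	//   	return err
	//   }); err != nil {
	//   	t.Fatal(err)
	//   }
	//
	// The fixed sleep is kept here because it is simple and keeps the
	// recovery-time measurement below conservative.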
	time.Sleep(15 * time.Second)

	start := timeutil.Now()
	restartNodeDB := c.Conn(ctx, restartNode)
	if _, err := restartNodeDB.Exec(`SELECT count(*) FROM tpcc.order_line`); err != nil {
		t.Fatal(err)
	}
	if took := timeutil.Since(start); took > downDuration {
		t.Fatalf(`expected to recover within %s, took %s`, downDuration, took)
	} else {
		c.l.Printf(`connecting and query finished in %s`, took)
	}
}

func registerRestart(r *testRegistry) {
	r.Add(testSpec{
		Name:    "restart/down-for-2m",
		Owner:   OwnerKV,
		Cluster: makeClusterSpec(3),
		// The `cockroach workload` subcommand is only available in 19.1+.
		MinVersion: "v19.1.0",
		Run: func(ctx context.Context, t *test, c *cluster) {
			runRestart(ctx, t, c, 2*time.Minute)
		},
	})
}
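
// Usage note (a sketch; the exact roachtest invocation and flags depend on
// the environment): once registered, the test can be selected by name via
// the roachtest runner, e.g.
//
//   roachtest run restart/down-for-2m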