github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/election.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	"time"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    18  )
    19  
    20  func registerElectionAfterRestart(r *testRegistry) {
    21  	r.Add(testSpec{
    22  		Name:    "election-after-restart",
    23  		Owner:   OwnerKV,
    24  		Skip:    "https://github.com/cockroachdb/cockroach/issues/35047",
    25  		Cluster: makeClusterSpec(3),
    26  		Run: func(ctx context.Context, t *test, c *cluster) {
    27  			t.Status("starting up")
    28  			c.Put(ctx, cockroach, "./cockroach")
    29  			c.Start(ctx, t)
    30  
    31  			// If the initial ranges aren't fully replicated by the time we
    32  			// run our splits, replicating them after the splits will take
    33  			// longer, so wait for the initial replication before
    34  			// proceeding.
    35  			time.Sleep(3 * time.Second)
    36  
    37  			t.Status("creating table and splits")
    38  			c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "
    39          CREATE DATABASE IF NOT EXISTS test;
    40          CREATE TABLE test.kv (k INT PRIMARY KEY, v INT);
    41          -- Prevent the merge queue from immediately discarding our splits.
    42          SET CLUSTER SETTING kv.range_merge.queue_enabled = false;
    43          ALTER TABLE test.kv SPLIT AT SELECT generate_series(0, 10000, 100)"`)
    44  
    45  			start := timeutil.Now()
    46  			c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "
    47          SELECT * FROM test.kv"`)
    48  			duration := timeutil.Since(start)
    49  			t.l.Printf("pre-restart, query took %s\n", duration)
    50  
    51  			// If we restart before all the nodes have applied the splits,
    52  			// there will be a lot of snapshot attempts (which may fail)
    53  			// after the restart. This appears to slow down startup enough
    54  			// to fail the condition below, so wait a bit for the dust to
    55  			// settle before restarting.
    56  			time.Sleep(3 * time.Second)
    57  
    58  			t.Status("restarting")
    59  			c.Stop(ctx)
    60  			c.Start(ctx, t)
    61  
    62  			// Each of the 100 ranges in this table must elect a leader for
    63  			// this query to complete. In naive raft, each of these
    64  			// elections would require waiting for a 3-second timeout, one
    65  			// at a time. This test verifies that our mechanisms to speed
    66  			// this up are working (we trigger elections eagerly, but not so
    67  			// eagerly that multiple elections conflict with each other).
    68  			start = timeutil.Now()
    69  			// Use a large CONNECT_TIMEOUT so that if the initial connection
    70  			// takes ages (perhaps due to some cli-internal query taking a
    71  			// very long time), we fail with the duration check below and
    72  			// not an opaque error from the cli.
    73  			buf, err := c.RunWithBuffer(ctx, t.l, c.Node(1), `COCKROACH_CONNECT_TIMEOUT=240 ./cockroach sql --insecure -e "
    74  SET TRACING = on;
    75  SELECT * FROM test.kv;
    76  SET TRACING = off;
    77  SHOW TRACE FOR SESSION;
    78  "`)
    79  			if err != nil {
    80  				t.Fatalf("%s\n\n%s", buf, err)
    81  			}
    82  			duration = timeutil.Since(start)
    83  			t.l.Printf("post-restart, query took %s\n", duration)
    84  			if expected := 15 * time.Second; duration > expected {
    85  				// In the happy case, this query runs in around 250ms. Prior
    86  				// to the introduction of this test, a bug caused most
    87  				// elections to fail and the query would take over 100
    88  				// seconds. There are still issues that can cause a few
    89  				// elections to fail (the biggest one as I write this is
    90  				// #26448), so we must use a generous timeout here. We may be
    91  				// able to tighten the bounds as we make more improvements.
    92  				t.l.Printf("%s\n", buf)
    93  				t.Fatalf("expected query to succeed in less than %s, took %s", expected, duration)
    94  			}
    95  		},
    96  	})
    97  }