github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/clearrange.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"time"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    19  )
    20  
    21  func registerClearRange(r *testRegistry) {
    22  	for _, checks := range []bool{true, false} {
    23  		checks := checks
    24  		r.Add(testSpec{
    25  			Name:  fmt.Sprintf(`clearrange/checks=%t`, checks),
    26  			Owner: OwnerStorage,
    27  			// 5h for import, 90 for the test. The import should take closer
    28  			// to <3:30h but it varies.
    29  			Timeout:    5*time.Hour + 90*time.Minute,
    30  			MinVersion: "v19.1.0",
    31  			Cluster:    makeClusterSpec(10),
    32  			Run: func(ctx context.Context, t *test, c *cluster) {
    33  				runClearRange(ctx, t, c, checks)
    34  			},
    35  		})
    36  	}
    37  }
    38  
    39  func runClearRange(ctx context.Context, t *test, c *cluster, aggressiveChecks bool) {
    40  	c.Put(ctx, cockroach, "./cockroach")
    41  
    42  	t.Status("restoring fixture")
    43  	c.Start(ctx, t)
    44  
    45  	// NB: on a 10 node cluster, this should take well below 3h.
    46  	tBegin := timeutil.Now()
    47  	c.Run(ctx, c.Node(1), "./cockroach", "workload", "fixtures", "import", "bank",
    48  		"--payload-bytes=10240", "--ranges=10", "--rows=65104166", "--seed=4", "--db=bigbank")
    49  	c.l.Printf("import took %.2fs", timeutil.Since(tBegin).Seconds())
    50  	c.Stop(ctx)
    51  	t.Status()
    52  
    53  	if aggressiveChecks {
    54  		// Run with an env var that runs a synchronous consistency check after each rebalance and merge.
    55  		// This slows down merges, so it might hide some races.
    56  		//
    57  		// NB: the below invocation was found to actually make it to the server at the time of writing.
    58  		c.Start(ctx, t, startArgs(
    59  			"--env", "COCKROACH_CONSISTENCY_AGGRESSIVE=true COCKROACH_ENFORCE_CONSISTENT_STATS=true",
    60  		))
    61  	} else {
    62  		c.Start(ctx, t)
    63  	}
    64  
    65  	// Also restore a much smaller table. We'll use it to run queries against
    66  	// the cluster after having dropped the large table above, verifying that
    67  	// the  cluster still works.
    68  	t.Status(`restoring tiny table`)
    69  	defer t.WorkerStatus()
    70  
    71  	// Use a 120s connect timeout to work around the fact that the server will
    72  	// declare itself ready before it's actually 100% ready. See:
    73  	// https://github.com/cockroachdb/cockroach/issues/34897#issuecomment-465089057
    74  	c.Run(ctx, c.Node(1), `COCKROACH_CONNECT_TIMEOUT=120 ./cockroach sql --insecure -e "DROP DATABASE IF EXISTS tinybank"`)
    75  	c.Run(ctx, c.Node(1), "./cockroach", "workload", "fixtures", "import", "bank", "--db=tinybank",
    76  		"--payload-bytes=100", "--ranges=10", "--rows=800", "--seed=1")
    77  
    78  	t.Status()
    79  
    80  	// Set up a convenience function that we can call to learn the number of
    81  	// ranges for the bigbank.bank table (even after it's been dropped).
    82  	numBankRanges := func() func() int {
    83  		conn := c.Conn(ctx, 1)
    84  		defer conn.Close()
    85  
    86  		var startHex string
    87  		if err := conn.QueryRow(
    88  			`SELECT to_hex(start_key) FROM crdb_internal.ranges_no_leases WHERE database_name = 'bigbank' AND table_name = 'bank' ORDER BY start_key ASC LIMIT 1`,
    89  		).Scan(&startHex); err != nil {
    90  			t.Fatal(err)
    91  		}
    92  		return func() int {
    93  			conn := c.Conn(ctx, 1)
    94  			defer conn.Close()
    95  			var n int
    96  			if err := conn.QueryRow(
    97  				`SELECT count(*) FROM crdb_internal.ranges_no_leases WHERE substr(to_hex(start_key), 1, length($1::string)) = $1`, startHex,
    98  			).Scan(&n); err != nil {
    99  				t.Fatal(err)
   100  			}
   101  			return n
   102  		}
   103  	}()
   104  
   105  	m := newMonitor(ctx, c)
   106  	m.Go(func(ctx context.Context) error {
   107  		conn := c.Conn(ctx, 1)
   108  		defer conn.Close()
   109  
   110  		if _, err := conn.ExecContext(ctx, `SET CLUSTER SETTING kv.range_merge.queue_enabled = true`); err != nil {
   111  			return err
   112  		}
   113  
   114  		// Merge as fast as possible to put maximum stress on the system.
   115  		if _, err := conn.ExecContext(ctx, `SET CLUSTER SETTING kv.range_merge.queue_interval = '0s'`); err != nil {
   116  			return err
   117  		}
   118  
   119  		t.WorkerStatus("dropping table")
   120  		defer t.WorkerStatus()
   121  
   122  		// Set a low TTL so that the ClearRange-based cleanup mechanism can kick in earlier.
   123  		// This could also be done after dropping the table.
   124  		if _, err := conn.ExecContext(ctx, `ALTER TABLE bigbank.bank CONFIGURE ZONE USING gc.ttlseconds = 30`); err != nil {
   125  			return err
   126  		}
   127  
   128  		t.WorkerStatus("computing number of ranges")
   129  		initialBankRanges := numBankRanges()
   130  
   131  		t.WorkerStatus("dropping bank table")
   132  		if _, err := conn.ExecContext(ctx, `DROP TABLE bigbank.bank`); err != nil {
   133  			return err
   134  		}
   135  
   136  		// Spend some time reading data with a timeout to make sure the
   137  		// DROP above didn't brick the cluster. At the time of writing,
   138  		// clearing all of the table data takes ~6min, so we want to run
   139  		// for at least a multiple of that duration.
   140  		const minDuration = 45 * time.Minute
   141  		deadline := timeutil.Now().Add(minDuration)
   142  		curBankRanges := numBankRanges()
   143  		t.WorkerStatus("waiting for ~", curBankRanges, " merges to complete (and for at least ", minDuration, " to pass)")
   144  		for timeutil.Now().Before(deadline) || curBankRanges > 1 {
   145  			after := time.After(5 * time.Minute)
   146  			curBankRanges = numBankRanges() // this call takes minutes, unfortunately
   147  			t.WorkerProgress(1 - float64(curBankRanges)/float64(initialBankRanges))
   148  
   149  			var count int
   150  			// NB: context cancellation in QueryRowContext does not work as expected.
   151  			// See #25435.
   152  			if _, err := conn.ExecContext(ctx, `SET statement_timeout = '5s'`); err != nil {
   153  				return err
   154  			}
   155  			// If we can't aggregate over 80kb in 5s, the database is far from usable.
   156  			if err := conn.QueryRowContext(ctx, `SELECT count(*) FROM tinybank.bank`).Scan(&count); err != nil {
   157  				return err
   158  			}
   159  
   160  			select {
   161  			case <-after:
   162  			case <-ctx.Done():
   163  				return ctx.Err()
   164  			}
   165  		}
   166  		// TODO(tschottdorf): verify that disk space usage drops below to <some small amount>, but that
   167  		// may not actually happen (see https://github.com/cockroachdb/cockroach/issues/29290).
   168  		return nil
   169  	})
   170  	m.Wait()
   171  }