github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/clearrange.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package main 12 13 import ( 14 "context" 15 "fmt" 16 "time" 17 18 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 19 ) 20 21 func registerClearRange(r *testRegistry) { 22 for _, checks := range []bool{true, false} { 23 checks := checks 24 r.Add(testSpec{ 25 Name: fmt.Sprintf(`clearrange/checks=%t`, checks), 26 Owner: OwnerStorage, 27 // 5h for import, 90 for the test. The import should take closer 28 // to <3:30h but it varies. 29 Timeout: 5*time.Hour + 90*time.Minute, 30 MinVersion: "v19.1.0", 31 Cluster: makeClusterSpec(10), 32 Run: func(ctx context.Context, t *test, c *cluster) { 33 runClearRange(ctx, t, c, checks) 34 }, 35 }) 36 } 37 } 38 39 func runClearRange(ctx context.Context, t *test, c *cluster, aggressiveChecks bool) { 40 c.Put(ctx, cockroach, "./cockroach") 41 42 t.Status("restoring fixture") 43 c.Start(ctx, t) 44 45 // NB: on a 10 node cluster, this should take well below 3h. 46 tBegin := timeutil.Now() 47 c.Run(ctx, c.Node(1), "./cockroach", "workload", "fixtures", "import", "bank", 48 "--payload-bytes=10240", "--ranges=10", "--rows=65104166", "--seed=4", "--db=bigbank") 49 c.l.Printf("import took %.2fs", timeutil.Since(tBegin).Seconds()) 50 c.Stop(ctx) 51 t.Status() 52 53 if aggressiveChecks { 54 // Run with an env var that runs a synchronous consistency check after each rebalance and merge. 55 // This slows down merges, so it might hide some races. 56 // 57 // NB: the below invocation was found to actually make it to the server at the time of writing. 58 c.Start(ctx, t, startArgs( 59 "--env", "COCKROACH_CONSISTENCY_AGGRESSIVE=true COCKROACH_ENFORCE_CONSISTENT_STATS=true", 60 )) 61 } else { 62 c.Start(ctx, t) 63 } 64 65 // Also restore a much smaller table. We'll use it to run queries against 66 // the cluster after having dropped the large table above, verifying that 67 // the cluster still works. 68 t.Status(`restoring tiny table`) 69 defer t.WorkerStatus() 70 71 // Use a 120s connect timeout to work around the fact that the server will 72 // declare itself ready before it's actually 100% ready. See: 73 // https://github.com/cockroachdb/cockroach/issues/34897#issuecomment-465089057 74 c.Run(ctx, c.Node(1), `COCKROACH_CONNECT_TIMEOUT=120 ./cockroach sql --insecure -e "DROP DATABASE IF EXISTS tinybank"`) 75 c.Run(ctx, c.Node(1), "./cockroach", "workload", "fixtures", "import", "bank", "--db=tinybank", 76 "--payload-bytes=100", "--ranges=10", "--rows=800", "--seed=1") 77 78 t.Status() 79 80 // Set up a convenience function that we can call to learn the number of 81 // ranges for the bigbank.bank table (even after it's been dropped). 82 numBankRanges := func() func() int { 83 conn := c.Conn(ctx, 1) 84 defer conn.Close() 85 86 var startHex string 87 if err := conn.QueryRow( 88 `SELECT to_hex(start_key) FROM crdb_internal.ranges_no_leases WHERE database_name = 'bigbank' AND table_name = 'bank' ORDER BY start_key ASC LIMIT 1`, 89 ).Scan(&startHex); err != nil { 90 t.Fatal(err) 91 } 92 return func() int { 93 conn := c.Conn(ctx, 1) 94 defer conn.Close() 95 var n int 96 if err := conn.QueryRow( 97 `SELECT count(*) FROM crdb_internal.ranges_no_leases WHERE substr(to_hex(start_key), 1, length($1::string)) = $1`, startHex, 98 ).Scan(&n); err != nil { 99 t.Fatal(err) 100 } 101 return n 102 } 103 }() 104 105 m := newMonitor(ctx, c) 106 m.Go(func(ctx context.Context) error { 107 conn := c.Conn(ctx, 1) 108 defer conn.Close() 109 110 if _, err := conn.ExecContext(ctx, `SET CLUSTER SETTING kv.range_merge.queue_enabled = true`); err != nil { 111 return err 112 } 113 114 // Merge as fast as possible to put maximum stress on the system. 115 if _, err := conn.ExecContext(ctx, `SET CLUSTER SETTING kv.range_merge.queue_interval = '0s'`); err != nil { 116 return err 117 } 118 119 t.WorkerStatus("dropping table") 120 defer t.WorkerStatus() 121 122 // Set a low TTL so that the ClearRange-based cleanup mechanism can kick in earlier. 123 // This could also be done after dropping the table. 124 if _, err := conn.ExecContext(ctx, `ALTER TABLE bigbank.bank CONFIGURE ZONE USING gc.ttlseconds = 30`); err != nil { 125 return err 126 } 127 128 t.WorkerStatus("computing number of ranges") 129 initialBankRanges := numBankRanges() 130 131 t.WorkerStatus("dropping bank table") 132 if _, err := conn.ExecContext(ctx, `DROP TABLE bigbank.bank`); err != nil { 133 return err 134 } 135 136 // Spend some time reading data with a timeout to make sure the 137 // DROP above didn't brick the cluster. At the time of writing, 138 // clearing all of the table data takes ~6min, so we want to run 139 // for at least a multiple of that duration. 140 const minDuration = 45 * time.Minute 141 deadline := timeutil.Now().Add(minDuration) 142 curBankRanges := numBankRanges() 143 t.WorkerStatus("waiting for ~", curBankRanges, " merges to complete (and for at least ", minDuration, " to pass)") 144 for timeutil.Now().Before(deadline) || curBankRanges > 1 { 145 after := time.After(5 * time.Minute) 146 curBankRanges = numBankRanges() // this call takes minutes, unfortunately 147 t.WorkerProgress(1 - float64(curBankRanges)/float64(initialBankRanges)) 148 149 var count int 150 // NB: context cancellation in QueryRowContext does not work as expected. 151 // See #25435. 152 if _, err := conn.ExecContext(ctx, `SET statement_timeout = '5s'`); err != nil { 153 return err 154 } 155 // If we can't aggregate over 80kb in 5s, the database is far from usable. 156 if err := conn.QueryRowContext(ctx, `SELECT count(*) FROM tinybank.bank`).Scan(&count); err != nil { 157 return err 158 } 159 160 select { 161 case <-after: 162 case <-ctx.Done(): 163 return ctx.Err() 164 } 165 } 166 // TODO(tschottdorf): verify that disk space usage drops below to <some small amount>, but that 167 // may not actually happen (see https://github.com/cockroachdb/cockroach/issues/29290). 168 return nil 169 }) 170 m.Wait() 171 }