github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cli/debug_synctest.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package cli 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "math/rand" 18 "os" 19 "os/exec" 20 "os/signal" 21 "path/filepath" 22 "strconv" 23 "time" 24 25 "github.com/cockroachdb/cockroach/pkg/roachpb" 26 "github.com/cockroachdb/cockroach/pkg/storage" 27 "github.com/cockroachdb/cockroach/pkg/util/encoding" 28 "github.com/cockroachdb/cockroach/pkg/util/stop" 29 "github.com/cockroachdb/errors" 30 "github.com/spf13/cobra" 31 ) 32 33 var debugSyncTestCmd = &cobra.Command{ 34 Use: "synctest <empty-dir> <nemesis-script>", 35 Short: "Run a log-like workload that can help expose filesystem anomalies", 36 Long: ` 37 synctest is a tool to verify filesystem consistency in the presence of I/O errors. 38 It takes a directory (required to be initially empty and created on demand) into 39 which data will be written and a nemesis script which receives a single argument 40 that is either "on" or "off". 41 42 The nemesis script will be called with a parameter of "on" when the filesystem 43 underlying the given directory should be "disturbed". It is called with "off" 44 to restore the undisturbed state (note that "off" must be idempotent). 45 46 synctest will run run across multiple "epochs", each terminated by an I/O error 47 injected by the nemesis. After each epoch, the nemesis is turned off and the 48 written data is reopened, checked for data loss, new data is written, and 49 the nemesis turned back on. In the absence of unexpected error or user interrupt, 50 this process continues indefinitely. 51 `, 52 Args: cobra.ExactArgs(2), 53 RunE: runDebugSyncTest, 54 } 55 56 type scriptNemesis string 57 58 func (sn scriptNemesis) exec(arg string) error { 59 b, err := exec.Command(string(sn), arg).CombinedOutput() 60 if err != nil { 61 return errors.WithDetailf(err, "command output:\n%s", string(b)) 62 } 63 fmt.Fprintf(stderr, "%s %s: %s", sn, arg, b) 64 return nil 65 } 66 67 func (sn scriptNemesis) On() error { 68 return sn.exec("on") 69 } 70 71 func (sn scriptNemesis) Off() error { 72 return sn.exec("off") 73 } 74 75 func runDebugSyncTest(cmd *cobra.Command, args []string) error { 76 // TODO(tschottdorf): make this a flag. 77 duration := 10 * time.Minute 78 79 ctx, cancel := context.WithTimeout(context.Background(), duration) 80 defer cancel() 81 82 nem := scriptNemesis(args[1]) 83 if err := nem.Off(); err != nil { 84 return errors.Wrap(err, "unable to disable nemesis at beginning of run") 85 } 86 87 var generation int 88 var lastSeq int64 89 for { 90 dir := filepath.Join(args[0], strconv.Itoa(generation)) 91 curLastSeq, err := runSyncer(ctx, dir, lastSeq, nem) 92 if err != nil { 93 return err 94 } 95 lastSeq = curLastSeq 96 if curLastSeq == 0 { 97 if ctx.Err() != nil { 98 // Clean shutdown. 99 return nil 100 } 101 // RocksDB dir got corrupted. 102 generation++ 103 continue 104 } 105 } 106 } 107 108 type nemesisI interface { 109 On() error 110 Off() error 111 } 112 113 func runSyncer( 114 ctx context.Context, dir string, expSeq int64, nemesis nemesisI, 115 ) (lastSeq int64, _ error) { 116 stopper := stop.NewStopper() 117 defer stopper.Stop(ctx) 118 119 db, err := OpenEngine(dir, stopper, OpenEngineOptions{}) 120 if err != nil { 121 if expSeq == 0 { 122 // Failed on first open, before we tried to corrupt anything. Hard stop. 123 return 0, err 124 } 125 fmt.Fprintln(stderr, "RocksDB directory", dir, "corrupted:", err) 126 return 0, nil // trigger reset 127 } 128 129 buf := make([]byte, 128) 130 var seq int64 131 key := func() roachpb.Key { 132 seq++ 133 return encoding.EncodeUvarintAscending(buf[:0:0], uint64(seq)) 134 } 135 136 check := func(kv storage.MVCCKeyValue) (bool, error) { 137 expKey := key() 138 if !bytes.Equal(kv.Key.Key, expKey) { 139 return false, errors.Errorf( 140 "found unexpected key %q (expected %q)", kv.Key.Key, expKey, 141 ) 142 } 143 return false, nil // want more 144 } 145 146 fmt.Fprintf(stderr, "verifying existing sequence numbers...") 147 if err := db.Iterate(roachpb.KeyMin, roachpb.KeyMax, check); err != nil { 148 return 0, err 149 } 150 // We must not lose writes, but sometimes we get extra ones (i.e. we caught an 151 // error but the write actually went through). 152 if expSeq != 0 && seq < expSeq { 153 return 0, errors.Errorf("highest persisted sequence number is %d, but expected at least %d", seq, expSeq) 154 } 155 fmt.Fprintf(stderr, "done (seq=%d).\nWriting new entries:\n", seq) 156 157 waitFailure := time.After(time.Duration(rand.Int63n(5 * time.Second.Nanoseconds()))) 158 159 stopper.RunWorker(ctx, func(ctx context.Context) { 160 <-waitFailure 161 if err := nemesis.On(); err != nil { 162 panic(err) 163 } 164 defer func() { 165 if err := nemesis.Off(); err != nil { 166 panic(err) 167 } 168 }() 169 <-stopper.ShouldQuiesce() 170 }) 171 172 ch := make(chan os.Signal, 1) 173 signal.Notify(ch, drainSignals...) 174 175 write := func() (_ int64, err error) { 176 defer func() { 177 // Catch any RocksDB NPEs. They do occur when enough 178 // faults are being injected. 179 if r := recover(); r != nil { 180 if err == nil { 181 err = errors.New("recovered panic on write") 182 } 183 err = errors.Wrapf(err, "%v", r) 184 } 185 }() 186 187 k, v := storage.MakeMVCCMetadataKey(key()), []byte("payload") 188 switch seq % 2 { 189 case 0: 190 if err := db.Put(k, v); err != nil { 191 seq-- 192 return seq, err 193 } 194 if err := db.Flush(); err != nil { 195 seq-- 196 return seq, err 197 } 198 default: 199 b := db.NewBatch() 200 if err := b.Put(k, v); err != nil { 201 seq-- 202 return seq, err 203 } 204 if err := b.Commit(true /* sync */); err != nil { 205 seq-- 206 return seq, err 207 } 208 } 209 return seq, nil 210 } 211 212 for { 213 if lastSeq, err := write(); err != nil { 214 // Exercise three cases: 215 // 1. no more writes after first failure 216 // 2. one more attempt, failure persists 217 // 3. two more attempts, file system healed for last attempt 218 for n := rand.Intn(3); n >= 0; n-- { 219 if n == 1 { 220 if err := nemesis.Off(); err != nil { 221 return 0, err 222 } 223 } 224 fmt.Fprintf(stderr, "error after seq %d (trying %d additional writes): %v\n", lastSeq, n, err) 225 lastSeq, err = write() 226 } 227 fmt.Fprintf(stderr, "error after seq %d: %v\n", lastSeq, err) 228 // Intentionally swallow the error to get into the next epoch. 229 return lastSeq, nil 230 } 231 select { 232 case sig := <-ch: 233 return seq, errors.Errorf("interrupted (%v)", sig) 234 case <-ctx.Done(): 235 return 0, nil 236 default: 237 } 238 } 239 }