github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod-stress/main.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package main 12 13 import ( 14 "bufio" 15 "bytes" 16 "context" 17 "flag" 18 "fmt" 19 "io" 20 "math/rand" 21 "os" 22 "os/exec" 23 "os/signal" 24 "path/filepath" 25 "regexp" 26 "runtime" 27 "strconv" 28 "strings" 29 "sync" 30 "sync/atomic" 31 "syscall" 32 "time" 33 34 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 35 "github.com/cockroachdb/errors" 36 ) 37 38 var ( 39 flags = flag.NewFlagSet(os.Args[0], flag.ContinueOnError) 40 flagP = flags.Int("p", runtime.NumCPU(), "run `N` processes in parallel") 41 flagTimeout = flags.Duration("timeout", 0, "timeout each process after `duration`") 42 _ = flags.Bool("kill", true, "kill timed out processes if true, otherwise just print pid (to attach with gdb)") 43 flagFailure = flags.String("failure", "", "fail only if output matches `regexp`") 44 flagIgnore = flags.String("ignore", "", "ignore failure if output matches `regexp`") 45 flagMaxTime = flags.Duration("maxtime", 0, "maximum time to run") 46 flagMaxRuns = flags.Int("maxruns", 0, "maximum number of runs") 47 _ = flags.Int("maxfails", 1, "maximum number of failures") 48 flagStderr = flags.Bool("stderr", true, "output failures to STDERR instead of to a temp file") 49 ) 50 51 func roundToSeconds(d time.Duration) time.Duration { 52 return time.Duration(d.Seconds()+0.5) * time.Second 53 } 54 55 func run() error { 56 flags.Usage = func() { 57 fmt.Fprintf(flags.Output(), "usage: %s <cluster> <pkg> [<flags>] -- [<args>]\n", flags.Name()) 58 flags.PrintDefaults() 59 } 60 61 if len(os.Args) < 2 { 62 var b bytes.Buffer 63 flags.SetOutput(&b) 64 flags.Usage() 65 return errors.Newf("%s", b.String()) 66 } 67 68 cluster := os.Args[1] 69 if err := flags.Parse(os.Args[2:]); err != nil { 70 return err 71 } 72 73 if !*flagStderr { 74 return errors.New("-stderr=false is unsupported, please tee to a file (or implement the feature)") 75 } 76 77 pkg := os.Args[2] 78 localTestBin := filepath.Base(pkg) + ".test" 79 { 80 fi, err := os.Stat(pkg) 81 if err != nil { 82 return fmt.Errorf("the pkg flag %q is not a directory relative to the current working directory: %v", pkg, err) 83 } 84 if !fi.Mode().IsDir() { 85 return fmt.Errorf("the pkg flag %q is not a directory relative to the current working directory", pkg) 86 } 87 88 // Verify that the test binary exists. 89 fi, err = os.Stat(localTestBin) 90 if err != nil { 91 return fmt.Errorf("test binary %q does not exist: %v", localTestBin, err) 92 } 93 if !fi.Mode().IsRegular() { 94 return fmt.Errorf("test binary %q is not a file", localTestBin) 95 } 96 } 97 flagsAndArgs := os.Args[3:] 98 stressArgs := flagsAndArgs 99 var testArgs []string 100 for i, arg := range flagsAndArgs { 101 if arg == "--" { 102 stressArgs = flagsAndArgs[:i] 103 testArgs = flagsAndArgs[i+1:] 104 break 105 } 106 } 107 108 if *flagP <= 0 || *flagTimeout < 0 || len(flags.Args()) == 0 { 109 var b bytes.Buffer 110 flags.SetOutput(&b) 111 flags.Usage() 112 return errors.Newf("%s", b.String()) 113 } 114 if *flagFailure != "" { 115 if _, err := regexp.Compile(*flagFailure); err != nil { 116 return fmt.Errorf("bad failure regexp: %s", err) 117 } 118 } 119 if *flagIgnore != "" { 120 if _, err := regexp.Compile(*flagIgnore); err != nil { 121 return fmt.Errorf("bad ignore regexp: %s", err) 122 } 123 } 124 125 cmd := exec.Command("roachprod", "status", cluster) 126 out, err := cmd.CombinedOutput() 127 if err != nil { 128 return fmt.Errorf("%v\n%s", err, out) 129 } 130 nodes := strings.Count(string(out), "\n") - 1 131 132 const stressBin = "bin.docker_amd64/stress" 133 134 cmd = exec.Command("roachprod", "put", cluster, stressBin) 135 cmd.Stdout = os.Stdout 136 cmd.Stderr = os.Stderr 137 if err := cmd.Run(); err != nil { 138 return err 139 } 140 141 cmd = exec.Command("roachprod", "run", cluster, "mkdir -p "+pkg) 142 if err := cmd.Run(); err != nil { 143 return err 144 } 145 testdataPath := filepath.Join(pkg, "testdata") 146 if _, err := os.Stat(testdataPath); err == nil { 147 // roachprod put has bizarre semantics for putting directories anywhere 148 // other than the home directory. To deal with this we put the directory 149 // in the home directory and then move it. 150 tmpPath := "testdata" + strconv.Itoa(rand.Int()) 151 cmd = exec.Command("roachprod", "run", cluster, "--", "rm", "-rf", testdataPath) 152 if output, err := cmd.CombinedOutput(); err != nil { 153 return fmt.Errorf("failed to remove old testdata: %v:\n%s", err, output) 154 } 155 cmd = exec.Command("roachprod", "put", cluster, testdataPath, tmpPath) 156 if err := cmd.Run(); err != nil { 157 return fmt.Errorf("failed to copy testdata: %v", err) 158 } 159 cmd = exec.Command("roachprod", "run", cluster, "mv", tmpPath, testdataPath) 160 if err := cmd.Run(); err != nil { 161 return fmt.Errorf("failed to move testdata: %v", err) 162 } 163 } 164 testBin := filepath.Join(pkg, localTestBin) 165 cmd = exec.Command("roachprod", "put", cluster, localTestBin, testBin) 166 cmd.Stdout = os.Stdout 167 cmd.Stderr = os.Stderr 168 if err := cmd.Run(); err != nil { 169 return err 170 } 171 172 c := make(chan os.Signal) 173 defer close(c) 174 signal.Notify(c, os.Interrupt) 175 signal.Notify(c, syscall.SIGHUP, syscall.SIGTERM) 176 defer signal.Stop(c) 177 178 startTime := timeutil.Now() 179 ctx, cancel := func(ctx context.Context) (context.Context, context.CancelFunc) { 180 if *flagMaxTime > 0 { 181 return context.WithTimeout(ctx, *flagMaxTime) 182 } 183 return context.WithCancel(ctx) 184 }(context.Background()) 185 defer cancel() 186 187 // NB: We don't use CommandContext below because it will `kill -9` the 188 // `roachprod ssh` processes. Rather, we watch for the context being canceled 189 // (or timing out) and explicitly stop the remote stress tests. 190 go func() { 191 <-ctx.Done() 192 fmt.Printf("shutting down\n") 193 _ = exec.Command("roachprod", "stop", cluster).Run() 194 }() 195 196 go func() { 197 for range c { 198 cancel() 199 } 200 }() 201 202 var wg sync.WaitGroup 203 defer wg.Wait() 204 205 var runs, fails int32 206 res := make(chan string) 207 error := func(s string) { 208 select { 209 case <-ctx.Done(): 210 case res <- s: 211 } 212 } 213 214 statusRE := regexp.MustCompile(`(\d+) runs (so far|completed), (\d+) failures, over .*`) 215 216 wg.Add(nodes) 217 for i := 1; i <= nodes; i++ { 218 go func(i int) { 219 stdoutR, stdoutW := io.Pipe() 220 defer func() { 221 _ = stdoutW.Close() 222 wg.Done() 223 }() 224 225 go func() { 226 defer func() { 227 _ = stdoutR.Close() 228 }() 229 230 var lastRuns, lastFails int 231 scanner := bufio.NewScanner(stdoutR) 232 for scanner.Scan() { 233 m := statusRE.FindStringSubmatch(scanner.Text()) 234 if m == nil { 235 continue 236 } 237 curRuns, err := strconv.Atoi(m[1]) 238 if err != nil { 239 error(fmt.Sprintf("%s", err)) 240 return 241 } 242 curFails, err := strconv.Atoi(m[3]) 243 if err != nil { 244 error(fmt.Sprintf("%s", err)) 245 return 246 } 247 if m[2] == "completed" { 248 break 249 } 250 251 atomic.AddInt32(&runs, int32(curRuns-lastRuns)) 252 atomic.AddInt32(&fails, int32(curFails-lastFails)) 253 lastRuns, lastFails = curRuns, curFails 254 255 if *flagMaxRuns > 0 && int(atomic.LoadInt32(&runs)) >= *flagMaxRuns { 256 cancel() 257 } 258 } 259 }() 260 var stderr bytes.Buffer 261 cmd := exec.Command("roachprod", 262 "ssh", fmt.Sprintf("%s:%d", cluster, i), "--", 263 fmt.Sprintf("cd %s; GOTRACEBACK=all ~/stress %s ./%s %s", 264 pkg, 265 strings.Join(stressArgs, " "), 266 filepath.Base(testBin), 267 strings.Join(testArgs, " "))) 268 cmd.Stdout = stdoutW 269 cmd.Stderr = &stderr 270 if err := cmd.Run(); err != nil { 271 error(stderr.String()) 272 } 273 }(i) 274 } 275 276 ticker := time.NewTicker(5 * time.Second).C 277 for { 278 select { 279 case out := <-res: 280 cancel() 281 fmt.Fprintf(os.Stderr, "\n%s\n", out) 282 case <-ticker: 283 fmt.Printf("%v runs so far, %v failures, over %s\n", 284 atomic.LoadInt32(&runs), atomic.LoadInt32(&fails), 285 roundToSeconds(timeutil.Since(startTime))) 286 case <-ctx.Done(): 287 fmt.Printf("%v runs completed, %v failures, over %s\n", 288 atomic.LoadInt32(&runs), atomic.LoadInt32(&fails), 289 roundToSeconds(timeutil.Since(startTime))) 290 291 err := ctx.Err() 292 switch { 293 // A context timeout in this case is indicative of no failures 294 // being detected in the allotted duration. 295 case errors.Is(err, context.DeadlineExceeded): 296 return nil 297 case errors.Is(err, context.Canceled): 298 if *flagMaxRuns > 0 && int(atomic.LoadInt32(&runs)) >= *flagMaxRuns { 299 return nil 300 } 301 return err 302 default: 303 return fmt.Errorf("unexpected context error: %v", err) 304 } 305 } 306 } 307 } 308 309 func main() { 310 if err := run(); err != nil { 311 fmt.Fprintln(os.Stderr, err) 312 fmt.Println("FAIL") 313 os.Exit(1) 314 } else { 315 fmt.Println("SUCCESS") 316 } 317 }