github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod-stress/main.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"bufio"
    15  	"bytes"
    16  	"context"
    17  	"flag"
    18  	"fmt"
    19  	"io"
    20  	"math/rand"
    21  	"os"
    22  	"os/exec"
    23  	"os/signal"
    24  	"path/filepath"
    25  	"regexp"
    26  	"runtime"
    27  	"strconv"
    28  	"strings"
    29  	"sync"
    30  	"sync/atomic"
    31  	"syscall"
    32  	"time"
    33  
    34  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    35  	"github.com/cockroachdb/errors"
    36  )
    37  
    38  var (
    39  	flags       = flag.NewFlagSet(os.Args[0], flag.ContinueOnError)
    40  	flagP       = flags.Int("p", runtime.NumCPU(), "run `N` processes in parallel")
    41  	flagTimeout = flags.Duration("timeout", 0, "timeout each process after `duration`")
    42  	_           = flags.Bool("kill", true, "kill timed out processes if true, otherwise just print pid (to attach with gdb)")
    43  	flagFailure = flags.String("failure", "", "fail only if output matches `regexp`")
    44  	flagIgnore  = flags.String("ignore", "", "ignore failure if output matches `regexp`")
    45  	flagMaxTime = flags.Duration("maxtime", 0, "maximum time to run")
    46  	flagMaxRuns = flags.Int("maxruns", 0, "maximum number of runs")
    47  	_           = flags.Int("maxfails", 1, "maximum number of failures")
    48  	flagStderr  = flags.Bool("stderr", true, "output failures to STDERR instead of to a temp file")
    49  )
    50  
    51  func roundToSeconds(d time.Duration) time.Duration {
    52  	return time.Duration(d.Seconds()+0.5) * time.Second
    53  }
    54  
    55  func run() error {
    56  	flags.Usage = func() {
    57  		fmt.Fprintf(flags.Output(), "usage: %s <cluster> <pkg> [<flags>] -- [<args>]\n", flags.Name())
    58  		flags.PrintDefaults()
    59  	}
    60  
    61  	if len(os.Args) < 2 {
    62  		var b bytes.Buffer
    63  		flags.SetOutput(&b)
    64  		flags.Usage()
    65  		return errors.Newf("%s", b.String())
    66  	}
    67  
    68  	cluster := os.Args[1]
    69  	if err := flags.Parse(os.Args[2:]); err != nil {
    70  		return err
    71  	}
    72  
    73  	if !*flagStderr {
    74  		return errors.New("-stderr=false is unsupported, please tee to a file (or implement the feature)")
    75  	}
    76  
    77  	pkg := os.Args[2]
    78  	localTestBin := filepath.Base(pkg) + ".test"
    79  	{
    80  		fi, err := os.Stat(pkg)
    81  		if err != nil {
    82  			return fmt.Errorf("the pkg flag %q is not a directory relative to the current working directory: %v", pkg, err)
    83  		}
    84  		if !fi.Mode().IsDir() {
    85  			return fmt.Errorf("the pkg flag %q is not a directory relative to the current working directory", pkg)
    86  		}
    87  
    88  		// Verify that the test binary exists.
    89  		fi, err = os.Stat(localTestBin)
    90  		if err != nil {
    91  			return fmt.Errorf("test binary %q does not exist: %v", localTestBin, err)
    92  		}
    93  		if !fi.Mode().IsRegular() {
    94  			return fmt.Errorf("test binary %q is not a file", localTestBin)
    95  		}
    96  	}
    97  	flagsAndArgs := os.Args[3:]
    98  	stressArgs := flagsAndArgs
    99  	var testArgs []string
   100  	for i, arg := range flagsAndArgs {
   101  		if arg == "--" {
   102  			stressArgs = flagsAndArgs[:i]
   103  			testArgs = flagsAndArgs[i+1:]
   104  			break
   105  		}
   106  	}
   107  
   108  	if *flagP <= 0 || *flagTimeout < 0 || len(flags.Args()) == 0 {
   109  		var b bytes.Buffer
   110  		flags.SetOutput(&b)
   111  		flags.Usage()
   112  		return errors.Newf("%s", b.String())
   113  	}
   114  	if *flagFailure != "" {
   115  		if _, err := regexp.Compile(*flagFailure); err != nil {
   116  			return fmt.Errorf("bad failure regexp: %s", err)
   117  		}
   118  	}
   119  	if *flagIgnore != "" {
   120  		if _, err := regexp.Compile(*flagIgnore); err != nil {
   121  			return fmt.Errorf("bad ignore regexp: %s", err)
   122  		}
   123  	}
   124  
   125  	cmd := exec.Command("roachprod", "status", cluster)
   126  	out, err := cmd.CombinedOutput()
   127  	if err != nil {
   128  		return fmt.Errorf("%v\n%s", err, out)
   129  	}
   130  	nodes := strings.Count(string(out), "\n") - 1
   131  
   132  	const stressBin = "bin.docker_amd64/stress"
   133  
   134  	cmd = exec.Command("roachprod", "put", cluster, stressBin)
   135  	cmd.Stdout = os.Stdout
   136  	cmd.Stderr = os.Stderr
   137  	if err := cmd.Run(); err != nil {
   138  		return err
   139  	}
   140  
   141  	cmd = exec.Command("roachprod", "run", cluster, "mkdir -p "+pkg)
   142  	if err := cmd.Run(); err != nil {
   143  		return err
   144  	}
   145  	testdataPath := filepath.Join(pkg, "testdata")
   146  	if _, err := os.Stat(testdataPath); err == nil {
   147  		// roachprod put has bizarre semantics for putting directories anywhere
   148  		// other than the home directory. To deal with this we put the directory
   149  		// in the home directory and then move it.
   150  		tmpPath := "testdata" + strconv.Itoa(rand.Int())
   151  		cmd = exec.Command("roachprod", "run", cluster, "--", "rm", "-rf", testdataPath)
   152  		if output, err := cmd.CombinedOutput(); err != nil {
   153  			return fmt.Errorf("failed to remove old testdata: %v:\n%s", err, output)
   154  		}
   155  		cmd = exec.Command("roachprod", "put", cluster, testdataPath, tmpPath)
   156  		if err := cmd.Run(); err != nil {
   157  			return fmt.Errorf("failed to copy testdata: %v", err)
   158  		}
   159  		cmd = exec.Command("roachprod", "run", cluster, "mv", tmpPath, testdataPath)
   160  		if err := cmd.Run(); err != nil {
   161  			return fmt.Errorf("failed to move testdata: %v", err)
   162  		}
   163  	}
   164  	testBin := filepath.Join(pkg, localTestBin)
   165  	cmd = exec.Command("roachprod", "put", cluster, localTestBin, testBin)
   166  	cmd.Stdout = os.Stdout
   167  	cmd.Stderr = os.Stderr
   168  	if err := cmd.Run(); err != nil {
   169  		return err
   170  	}
   171  
   172  	c := make(chan os.Signal)
   173  	defer close(c)
   174  	signal.Notify(c, os.Interrupt)
   175  	signal.Notify(c, syscall.SIGHUP, syscall.SIGTERM)
   176  	defer signal.Stop(c)
   177  
   178  	startTime := timeutil.Now()
   179  	ctx, cancel := func(ctx context.Context) (context.Context, context.CancelFunc) {
   180  		if *flagMaxTime > 0 {
   181  			return context.WithTimeout(ctx, *flagMaxTime)
   182  		}
   183  		return context.WithCancel(ctx)
   184  	}(context.Background())
   185  	defer cancel()
   186  
   187  	// NB: We don't use CommandContext below because it will `kill -9` the
   188  	// `roachprod ssh` processes. Rather, we watch for the context being canceled
   189  	// (or timing out) and explicitly stop the remote stress tests.
   190  	go func() {
   191  		<-ctx.Done()
   192  		fmt.Printf("shutting down\n")
   193  		_ = exec.Command("roachprod", "stop", cluster).Run()
   194  	}()
   195  
   196  	go func() {
   197  		for range c {
   198  			cancel()
   199  		}
   200  	}()
   201  
   202  	var wg sync.WaitGroup
   203  	defer wg.Wait()
   204  
   205  	var runs, fails int32
   206  	res := make(chan string)
   207  	error := func(s string) {
   208  		select {
   209  		case <-ctx.Done():
   210  		case res <- s:
   211  		}
   212  	}
   213  
   214  	statusRE := regexp.MustCompile(`(\d+) runs (so far|completed), (\d+) failures, over .*`)
   215  
   216  	wg.Add(nodes)
   217  	for i := 1; i <= nodes; i++ {
   218  		go func(i int) {
   219  			stdoutR, stdoutW := io.Pipe()
   220  			defer func() {
   221  				_ = stdoutW.Close()
   222  				wg.Done()
   223  			}()
   224  
   225  			go func() {
   226  				defer func() {
   227  					_ = stdoutR.Close()
   228  				}()
   229  
   230  				var lastRuns, lastFails int
   231  				scanner := bufio.NewScanner(stdoutR)
   232  				for scanner.Scan() {
   233  					m := statusRE.FindStringSubmatch(scanner.Text())
   234  					if m == nil {
   235  						continue
   236  					}
   237  					curRuns, err := strconv.Atoi(m[1])
   238  					if err != nil {
   239  						error(fmt.Sprintf("%s", err))
   240  						return
   241  					}
   242  					curFails, err := strconv.Atoi(m[3])
   243  					if err != nil {
   244  						error(fmt.Sprintf("%s", err))
   245  						return
   246  					}
   247  					if m[2] == "completed" {
   248  						break
   249  					}
   250  
   251  					atomic.AddInt32(&runs, int32(curRuns-lastRuns))
   252  					atomic.AddInt32(&fails, int32(curFails-lastFails))
   253  					lastRuns, lastFails = curRuns, curFails
   254  
   255  					if *flagMaxRuns > 0 && int(atomic.LoadInt32(&runs)) >= *flagMaxRuns {
   256  						cancel()
   257  					}
   258  				}
   259  			}()
   260  			var stderr bytes.Buffer
   261  			cmd := exec.Command("roachprod",
   262  				"ssh", fmt.Sprintf("%s:%d", cluster, i), "--",
   263  				fmt.Sprintf("cd %s; GOTRACEBACK=all ~/stress %s ./%s %s",
   264  					pkg,
   265  					strings.Join(stressArgs, " "),
   266  					filepath.Base(testBin),
   267  					strings.Join(testArgs, " ")))
   268  			cmd.Stdout = stdoutW
   269  			cmd.Stderr = &stderr
   270  			if err := cmd.Run(); err != nil {
   271  				error(stderr.String())
   272  			}
   273  		}(i)
   274  	}
   275  
   276  	ticker := time.NewTicker(5 * time.Second).C
   277  	for {
   278  		select {
   279  		case out := <-res:
   280  			cancel()
   281  			fmt.Fprintf(os.Stderr, "\n%s\n", out)
   282  		case <-ticker:
   283  			fmt.Printf("%v runs so far, %v failures, over %s\n",
   284  				atomic.LoadInt32(&runs), atomic.LoadInt32(&fails),
   285  				roundToSeconds(timeutil.Since(startTime)))
   286  		case <-ctx.Done():
   287  			fmt.Printf("%v runs completed, %v failures, over %s\n",
   288  				atomic.LoadInt32(&runs), atomic.LoadInt32(&fails),
   289  				roundToSeconds(timeutil.Since(startTime)))
   290  
   291  			err := ctx.Err()
   292  			switch {
   293  			// A context timeout in this case is indicative of no failures
   294  			// being detected in the allotted duration.
   295  			case errors.Is(err, context.DeadlineExceeded):
   296  				return nil
   297  			case errors.Is(err, context.Canceled):
   298  				if *flagMaxRuns > 0 && int(atomic.LoadInt32(&runs)) >= *flagMaxRuns {
   299  					return nil
   300  				}
   301  				return err
   302  			default:
   303  				return fmt.Errorf("unexpected context error: %v", err)
   304  			}
   305  		}
   306  	}
   307  }
   308  
   309  func main() {
   310  	if err := run(); err != nil {
   311  		fmt.Fprintln(os.Stderr, err)
   312  		fmt.Println("FAIL")
   313  		os.Exit(1)
   314  	} else {
   315  		fmt.Println("SUCCESS")
   316  	}
   317  }