github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/contrib/jepsen/main.go (about)

     1  /*
     2   * Copyright 2019 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  // Runs Dgraph Jepsen tests with a local Dgraph binary.
    18  // Set JEPSEN_ROOT environment variable before running.
    19  //
    20  // Example usage:
    21  //
    22  // Runs all test and nemesis combinations (36 total)
    23  //     ./jepsen --test-all
    24  //
    25  // Runs bank test with partition-ring nemesis for 10 minutes
//     ./jepsen --workload bank --nemesis partition-ring
    27  
    28  package main
    29  
import (
	"bytes"
	"context"
	"fmt"
	"io"
	"log"
	"os"
	"os/exec"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/dgraph-io/dgraph/contrib/jepsen/browser"
	"github.com/spf13/pflag"
)
    45  
    46  type jepsenTest struct {
    47  	workload          string
    48  	nemesis           string
    49  	timeLimit         int
    50  	concurrency       string
    51  	rebalanceInterval string
    52  	localBinary       string
    53  	nodes             string
    54  	skew              string
    55  	testCount         int
    56  }
    57  
// Outcome codes returned by runJepsenTest and consumed by the callback that
// tcStart returns.
const (
	// testPass: the command succeeded and its output contained
	// "Everything looks good!".
	testPass = iota
	// testFail: the command failed and its output contained "Analysis invalid".
	testFail
	// testIncomplete: any other outcome (command error without an invalid
	// analysis, or a successful exit without the success marker).
	testIncomplete
)
    63  
var (
	// availableWorkloads lists the workloads run by --test-all and printed in
	// the usage message.
	availableWorkloads = []string{
		"bank",
		"delete",
		"long-fork",
		"linearizable-register",
		"uid-linearizable-register",
		"upsert",
		"set",
		"uid-set",
		"sequential",
	}
	// availableNemeses lists the nemeses run by --test-all and printed in the
	// usage message. A single entry may combine several nemesis types with
	// commas (e.g. "kill-alpha,kill-zero"); it is passed to Jepsen as one
	// --nemesis value.
	availableNemeses = []string{
		"none",
		"kill-alpha,kill-zero",
		"partition-ring",
		"move-tablet",
	}
)
    83  
var (
	// ctxb is the root context: command uses it directly and runJepsenTest
	// derives its per-test timeout context from it.
	ctxb = context.Background()

	// Jepsen test flags. These are copied into a jepsenTest by main and
	// forwarded to "lein run test" by runJepsenTest. The workload and nemesis
	// values are split on spaces in main, producing one test per
	// workload/nemesis pair.
	workload = pflag.StringP("workload", "w", "",
		"Test workload to run.")
	nemesis = pflag.StringP("nemesis", "n", "",
		"A space-separated, comma-separated list of nemesis types.")
	timeLimit = pflag.IntP("time-limit", "l", 600,
		"Time limit per Jepsen test in seconds.")
	concurrency = pflag.String("concurrency", "6n",
		"Number of concurrent workers. \"6n\" means 6 workers per node.")
	rebalanceInterval = pflag.String("rebalance-interval", "10h",
		"Interval of Dgraph's tablet rebalancing.")
	localBinary = pflag.StringP("local-binary", "b", "/gobin/dgraph",
		"Path to Dgraph binary within the Jepsen control node.")
	nodes     = pflag.String("nodes", "n1,n2,n3,n4,n5", "Nodes to run on.")
	skew      = pflag.String("skew", "", "Skew clock amount. (tiny, small, big, huge)")
	testCount = pflag.IntP("test-count", "c", 1, "Test count per Jepsen test.")
	jaeger    = pflag.StringP("jaeger", "j", "http://jaeger:14268",
		"Run with Jaeger collector. Set to empty string to disable collection to Jaeger.")

	// Jepsen control flags. These decide which cluster lifecycle steps main
	// performs around the tests.
	doUp       = pflag.BoolP("up", "u", true, "Run Jepsen ./up.sh.")
	doUpOnly   = pflag.BoolP("up-only", "U", false, "Do --up and exit.")
	doDown     = pflag.BoolP("down", "d", false, "Stop the Jepsen cluster after tests run.")
	doDownOnly = pflag.BoolP("down-only", "D", false, "Do --down and exit. Does not run tests.")
	doServe    = pflag.Bool("serve", true, "Serve the test results page (lein run serve).")
	web        = pflag.Bool("web", true, "Open the test results page in the browser.")

	// Script flags. These change how this runner itself behaves rather than
	// what is passed to Jepsen.
	dryRun = pflag.BoolP("dry-run", "y", false,
		"Echo commands that would run, but don't execute them.")
	ciOutput = pflag.BoolP("ci-output", "q", false,
		"Output TeamCity test result directives instead of Jepsen test output.")
	testAll = pflag.Bool("test-all", false, "Run all workload and nemesis combinations.")
)
   121  
// command builds an *exec.Cmd for the given program and arguments using the
// background context, i.e. without a deadline. See commandContext for the
// --dry-run behavior.
func command(cmd ...string) *exec.Cmd {
	return commandContext(ctxb, cmd...)
}
   125  
   126  func commandContext(ctx context.Context, cmd ...string) *exec.Cmd {
   127  	if *dryRun {
   128  		// Properly quote the args so the echoed output can run via copy/paste.
   129  		quoted := []string{}
   130  		for _, c := range cmd {
   131  			if strings.Contains(c, " ") {
   132  				quoted = append(quoted, strconv.Quote(c))
   133  			} else {
   134  				quoted = append(quoted, c)
   135  			}
   136  
   137  		}
   138  		return exec.CommandContext(ctx, "echo", quoted...)
   139  	}
   140  	return exec.CommandContext(ctx, cmd[0], cmd[1:]...)
   141  }
   142  
   143  func jepsenUp() {
   144  	cmd := command("./up.sh",
   145  		"--dev", "--daemon", "--compose", "../dgraph/docker/docker-compose.yml")
   146  	cmd.Dir = os.Getenv("JEPSEN_ROOT") + "/docker/"
   147  	cmd.Stdout = os.Stdout
   148  	cmd.Stderr = os.Stderr
   149  	if err := cmd.Run(); err != nil {
   150  		log.Fatal(err)
   151  	}
   152  }
   153  
   154  func jepsenDown() {
   155  	cmd := command("docker-compose",
   156  		"-f", "./docker-compose.yml",
   157  		"-f", "../dgraph/docker/docker-compose.yml",
   158  		"down")
   159  	cmd.Dir = os.Getenv("JEPSEN_ROOT") + "/docker/"
   160  	cmd.Stdout = os.Stdout
   161  	cmd.Stderr = os.Stderr
   162  	if err := cmd.Run(); err != nil {
   163  		log.Fatal(err)
   164  	}
   165  }
   166  
   167  func jepsenServe() {
   168  	cmd := command(
   169  		"docker", "exec", "--workdir", "/jepsen/dgraph", "jepsen-control",
   170  		"lein", "run", "serve")
   171  	// Ignore output and errors. It's okay if "lein run serve" already ran before.
   172  	_ = cmd.Run()
   173  }
   174  
   175  func openJepsenBrowser() {
   176  	cmd := command(
   177  		"docker", "inspect", "--format",
   178  		`{{ (index (index .NetworkSettings.Ports "8080/tcp") 0).HostPort }}`,
   179  		"jepsen-control")
   180  	var out bytes.Buffer
   181  	cmd.Stdout = &out
   182  	if err := cmd.Run(); err != nil {
   183  		log.Fatal(err)
   184  	}
   185  	port := strings.TrimSpace(out.String())
   186  	jepsenUrl := "http://localhost:" + port
   187  	browser.Open(jepsenUrl)
   188  }
   189  
   190  func runJepsenTest(test *jepsenTest) int {
   191  	dockerCmd := []string{
   192  		"docker", "exec", "jepsen-control",
   193  		"/bin/bash", "-c",
   194  	}
   195  	testCmd := []string{
   196  		// setup commands needed to set up ssh-agent to ssh into nodes.
   197  		"source", "~/.bashrc", "&&",
   198  		"cd", "/jepsen/dgraph", "&&",
   199  		// test commands
   200  		"lein", "run", "test",
   201  		"--workload", test.workload,
   202  		"--nemesis", test.nemesis,
   203  		"--time-limit", strconv.Itoa(test.timeLimit),
   204  		"--concurrency", test.concurrency,
   205  		"--rebalance-interval", test.rebalanceInterval,
   206  		"--local-binary", test.localBinary,
   207  		"--nodes", test.nodes,
   208  		"--test-count", strconv.Itoa(test.testCount),
   209  	}
   210  	if test.nemesis == "skew-clock" {
   211  		testCmd = append(testCmd, "--skew", test.skew)
   212  	}
   213  	if *jaeger != "" {
   214  		testCmd = append(testCmd, "--dgraph-jaeger-collector", *jaeger)
   215  		testCmd = append(testCmd, "--tracing", *jaeger+"/api/traces")
   216  	}
   217  	dockerCmd = append(dockerCmd, strings.Join(testCmd, " "))
   218  
   219  	// Timeout should be a bit longer than the Jepsen test time limit to account
   220  	// for post-analysis time.
   221  	commandTimeout := 10*time.Minute + time.Duration(test.timeLimit)*time.Second
   222  	ctx, cancel := context.WithTimeout(ctxb, commandTimeout)
   223  	defer cancel()
   224  	cmd := commandContext(ctx, dockerCmd...)
   225  
   226  	var out bytes.Buffer
   227  	var stdout io.Writer
   228  	var stderr io.Writer
   229  	stdout = io.MultiWriter(&out, os.Stdout)
   230  	stderr = io.MultiWriter(&out, os.Stderr)
   231  	if inCi() {
   232  		// Jepsen test output to os.Stdout/os.Stderr is not needed in TeamCity.
   233  		stdout = &out
   234  		stderr = &out
   235  	}
   236  	cmd.Stdout = stdout
   237  	cmd.Stderr = stderr
   238  
   239  	if err := cmd.Run(); err != nil {
   240  		// TODO The exit code could probably be checked instead of checking the output.
   241  		// Check jepsen source to be sure.
   242  		if strings.Contains(out.String(), "Analysis invalid") {
   243  			return testFail
   244  		}
   245  		return testIncomplete
   246  	}
   247  	if strings.Contains(out.String(), "Everything looks good!") {
   248  		return testPass
   249  	}
   250  	return testIncomplete
   251  }
   252  
   253  func inCi() bool {
   254  	return *ciOutput || os.Getenv("TEAMCITY_VERSION") != ""
   255  }
   256  
   257  func tcStart(testName string) func(pass int) {
   258  	if !inCi() {
   259  		return func(int) {}
   260  	}
   261  	now := time.Now()
   262  	fmt.Printf("##teamcity[testStarted name='%v']\n", testName)
   263  	return func(pass int) {
   264  		durMs := time.Since(now).Nanoseconds() / 1e6
   265  		switch pass {
   266  		case testPass:
   267  			fmt.Printf("##teamcity[testFinished name='%v' duration='%v']\n", testName, durMs)
   268  		case testFail:
   269  			fmt.Printf("##teamcity[testFailed='%v' duration='%v']\n", testName, durMs)
   270  		case testIncomplete:
   271  			fmt.Printf("##teamcity[testFailed='%v' duration='%v' message='Test incomplete.']\n",
   272  				testName, durMs)
   273  		}
   274  	}
   275  }
   276  
   277  func main() {
   278  	pflag.Parse()
   279  
   280  	if os.Getenv("JEPSEN_ROOT") == "" {
   281  		log.Fatal("JEPSEN_ROOT must be set.")
   282  	}
   283  	if os.Getenv("GOPATH") == "" {
   284  		log.Fatal("GOPATH must be set.")
   285  	}
   286  
   287  	if *doDownOnly {
   288  		jepsenDown()
   289  		os.Exit(0)
   290  	}
   291  	if *doUpOnly {
   292  		jepsenUp()
   293  		os.Exit(0)
   294  	}
   295  
   296  	if *testAll {
   297  		*workload = strings.Join(availableWorkloads, " ")
   298  		*nemesis = strings.Join(availableNemeses, " ")
   299  	}
   300  
   301  	if *workload == "" || *nemesis == "" {
   302  		fmt.Printf("You must specify a workload and a nemesis.\n")
   303  
   304  		fmt.Printf("Available workloads:\n")
   305  		for _, w := range availableWorkloads {
   306  			fmt.Printf("\t%v\n", w)
   307  		}
   308  		fmt.Printf("Available nemeses:\n")
   309  		for _, n := range availableNemeses {
   310  			fmt.Printf("\t%v\n", n)
   311  		}
   312  		fmt.Printf("Example commands:\n")
   313  		fmt.Printf("$ %v -w bank -n none\n", os.Args[0])
   314  		fmt.Printf("$ %v -w 'bank delete' -n 'none kill-alpha,kill-zero move-tablet'\n", os.Args[0])
   315  		fmt.Printf("$ %v --test-all\n", os.Args[0])
   316  		os.Exit(1)
   317  	}
   318  
   319  	if strings.Contains(*nemesis, "skew-clock") && *skew == "" {
   320  		log.Fatal("skew-clock nemesis specified but --jepsen.skew wasn't set.")
   321  	}
   322  
   323  	if *doUp {
   324  		jepsenUp()
   325  	}
   326  	if *doServe {
   327  		go jepsenServe()
   328  		if *web && !*dryRun {
   329  			openJepsenBrowser()
   330  		}
   331  	}
   332  	if *web && !*dryRun && *jaeger != "" {
   333  		// Open Jaeger UI
   334  		browser.Open("http://localhost:16686")
   335  	}
   336  
   337  	workloads := strings.Split(*workload, " ")
   338  	nemeses := strings.Split(*nemesis, " ")
   339  	fmt.Printf("Num tests: %v\n", len(workloads)*len(nemeses))
   340  	for _, n := range nemeses {
   341  		for _, w := range workloads {
   342  			tcEnd := tcStart(fmt.Sprintf("Workload:%v,Nemeses:%v", w, n))
   343  			status := runJepsenTest(&jepsenTest{
   344  				workload:          w,
   345  				nemesis:           n,
   346  				timeLimit:         *timeLimit,
   347  				concurrency:       *concurrency,
   348  				rebalanceInterval: *rebalanceInterval,
   349  				localBinary:       *localBinary,
   350  				nodes:             *nodes,
   351  				skew:              *skew,
   352  				testCount:         *testCount,
   353  			})
   354  			tcEnd(status)
   355  		}
   356  	}
   357  
   358  	if *doDown {
   359  		jepsenDown()
   360  	}
   361  }