github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/contrib/jepsen/main.go (about) 1 /* 2 * Copyright 2019 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // Runs Dgraph Jepsen tests with a local Dgraph binary. 18 // Set JEPSEN_ROOT environment variable before running. 19 // 20 // Example usage: 21 // 22 // Runs all test and nemesis combinations (36 total) 23 // ./jepsen --test-all 24 // 25 // Runs bank test with partition-ring nemesis for 10 minutes 26 // ./jepsen --jepsen.workload bank --jepsen.nemesis partition-ring 27 28 package main 29 30 import ( 31 "bytes" 32 "context" 33 "fmt" 34 "io" 35 "log" 36 "os" 37 "os/exec" 38 "strconv" 39 "strings" 40 "time" 41 42 "github.com/dgraph-io/dgraph/contrib/jepsen/browser" 43 "github.com/spf13/pflag" 44 ) 45 46 type jepsenTest struct { 47 workload string 48 nemesis string 49 timeLimit int 50 concurrency string 51 rebalanceInterval string 52 localBinary string 53 nodes string 54 skew string 55 testCount int 56 } 57 58 const ( 59 testPass = iota 60 testFail 61 testIncomplete 62 ) 63 64 var ( 65 availableWorkloads = []string{ 66 "bank", 67 "delete", 68 "long-fork", 69 "linearizable-register", 70 "uid-linearizable-register", 71 "upsert", 72 "set", 73 "uid-set", 74 "sequential", 75 } 76 availableNemeses = []string{ 77 "none", 78 "kill-alpha,kill-zero", 79 "partition-ring", 80 "move-tablet", 81 } 82 ) 83 84 var ( 85 ctxb = context.Background() 86 87 // Jepsen test flags 88 workload = pflag.StringP("workload", "w", "", 89 "Test workload to run.") 90 nemesis = pflag.StringP("nemesis", "n", "", 91 "A space-separated, comma-separated list of nemesis types.") 92 timeLimit = pflag.IntP("time-limit", "l", 600, 93 "Time limit per Jepsen test in seconds.") 94 concurrency = pflag.String("concurrency", "6n", 95 "Number of concurrent workers. \"6n\" means 6 workers per node.") 96 rebalanceInterval = pflag.String("rebalance-interval", "10h", 97 "Interval of Dgraph's tablet rebalancing.") 98 localBinary = pflag.StringP("local-binary", "b", "/gobin/dgraph", 99 "Path to Dgraph binary within the Jepsen control node.") 100 nodes = pflag.String("nodes", "n1,n2,n3,n4,n5", "Nodes to run on.") 101 skew = pflag.String("skew", "", "Skew clock amount. (tiny, small, big, huge)") 102 testCount = pflag.IntP("test-count", "c", 1, "Test count per Jepsen test.") 103 jaeger = pflag.StringP("jaeger", "j", "http://jaeger:14268", 104 "Run with Jaeger collector. Set to empty string to disable collection to Jaeger.") 105 106 // Jepsen control flags 107 doUp = pflag.BoolP("up", "u", true, "Run Jepsen ./up.sh.") 108 doUpOnly = pflag.BoolP("up-only", "U", false, "Do --up and exit.") 109 doDown = pflag.BoolP("down", "d", false, "Stop the Jepsen cluster after tests run.") 110 doDownOnly = pflag.BoolP("down-only", "D", false, "Do --down and exit. Does not run tests.") 111 doServe = pflag.Bool("serve", true, "Serve the test results page (lein run serve).") 112 web = pflag.Bool("web", true, "Open the test results page in the browser.") 113 114 // Script flags 115 dryRun = pflag.BoolP("dry-run", "y", false, 116 "Echo commands that would run, but don't execute them.") 117 ciOutput = pflag.BoolP("ci-output", "q", false, 118 "Output TeamCity test result directives instead of Jepsen test output.") 119 testAll = pflag.Bool("test-all", false, "Run all workload and nemesis combinations.") 120 ) 121 122 func command(cmd ...string) *exec.Cmd { 123 return commandContext(ctxb, cmd...) 124 } 125 126 func commandContext(ctx context.Context, cmd ...string) *exec.Cmd { 127 if *dryRun { 128 // Properly quote the args so the echoed output can run via copy/paste. 129 quoted := []string{} 130 for _, c := range cmd { 131 if strings.Contains(c, " ") { 132 quoted = append(quoted, strconv.Quote(c)) 133 } else { 134 quoted = append(quoted, c) 135 } 136 137 } 138 return exec.CommandContext(ctx, "echo", quoted...) 139 } 140 return exec.CommandContext(ctx, cmd[0], cmd[1:]...) 141 } 142 143 func jepsenUp() { 144 cmd := command("./up.sh", 145 "--dev", "--daemon", "--compose", "../dgraph/docker/docker-compose.yml") 146 cmd.Dir = os.Getenv("JEPSEN_ROOT") + "/docker/" 147 cmd.Stdout = os.Stdout 148 cmd.Stderr = os.Stderr 149 if err := cmd.Run(); err != nil { 150 log.Fatal(err) 151 } 152 } 153 154 func jepsenDown() { 155 cmd := command("docker-compose", 156 "-f", "./docker-compose.yml", 157 "-f", "../dgraph/docker/docker-compose.yml", 158 "down") 159 cmd.Dir = os.Getenv("JEPSEN_ROOT") + "/docker/" 160 cmd.Stdout = os.Stdout 161 cmd.Stderr = os.Stderr 162 if err := cmd.Run(); err != nil { 163 log.Fatal(err) 164 } 165 } 166 167 func jepsenServe() { 168 cmd := command( 169 "docker", "exec", "--workdir", "/jepsen/dgraph", "jepsen-control", 170 "lein", "run", "serve") 171 // Ignore output and errors. It's okay if "lein run serve" already ran before. 172 _ = cmd.Run() 173 } 174 175 func openJepsenBrowser() { 176 cmd := command( 177 "docker", "inspect", "--format", 178 `{{ (index (index .NetworkSettings.Ports "8080/tcp") 0).HostPort }}`, 179 "jepsen-control") 180 var out bytes.Buffer 181 cmd.Stdout = &out 182 if err := cmd.Run(); err != nil { 183 log.Fatal(err) 184 } 185 port := strings.TrimSpace(out.String()) 186 jepsenUrl := "http://localhost:" + port 187 browser.Open(jepsenUrl) 188 } 189 190 func runJepsenTest(test *jepsenTest) int { 191 dockerCmd := []string{ 192 "docker", "exec", "jepsen-control", 193 "/bin/bash", "-c", 194 } 195 testCmd := []string{ 196 // setup commands needed to set up ssh-agent to ssh into nodes. 197 "source", "~/.bashrc", "&&", 198 "cd", "/jepsen/dgraph", "&&", 199 // test commands 200 "lein", "run", "test", 201 "--workload", test.workload, 202 "--nemesis", test.nemesis, 203 "--time-limit", strconv.Itoa(test.timeLimit), 204 "--concurrency", test.concurrency, 205 "--rebalance-interval", test.rebalanceInterval, 206 "--local-binary", test.localBinary, 207 "--nodes", test.nodes, 208 "--test-count", strconv.Itoa(test.testCount), 209 } 210 if test.nemesis == "skew-clock" { 211 testCmd = append(testCmd, "--skew", test.skew) 212 } 213 if *jaeger != "" { 214 testCmd = append(testCmd, "--dgraph-jaeger-collector", *jaeger) 215 testCmd = append(testCmd, "--tracing", *jaeger+"/api/traces") 216 } 217 dockerCmd = append(dockerCmd, strings.Join(testCmd, " ")) 218 219 // Timeout should be a bit longer than the Jepsen test time limit to account 220 // for post-analysis time. 221 commandTimeout := 10*time.Minute + time.Duration(test.timeLimit)*time.Second 222 ctx, cancel := context.WithTimeout(ctxb, commandTimeout) 223 defer cancel() 224 cmd := commandContext(ctx, dockerCmd...) 225 226 var out bytes.Buffer 227 var stdout io.Writer 228 var stderr io.Writer 229 stdout = io.MultiWriter(&out, os.Stdout) 230 stderr = io.MultiWriter(&out, os.Stderr) 231 if inCi() { 232 // Jepsen test output to os.Stdout/os.Stderr is not needed in TeamCity. 233 stdout = &out 234 stderr = &out 235 } 236 cmd.Stdout = stdout 237 cmd.Stderr = stderr 238 239 if err := cmd.Run(); err != nil { 240 // TODO The exit code could probably be checked instead of checking the output. 241 // Check jepsen source to be sure. 242 if strings.Contains(out.String(), "Analysis invalid") { 243 return testFail 244 } 245 return testIncomplete 246 } 247 if strings.Contains(out.String(), "Everything looks good!") { 248 return testPass 249 } 250 return testIncomplete 251 } 252 253 func inCi() bool { 254 return *ciOutput || os.Getenv("TEAMCITY_VERSION") != "" 255 } 256 257 func tcStart(testName string) func(pass int) { 258 if !inCi() { 259 return func(int) {} 260 } 261 now := time.Now() 262 fmt.Printf("##teamcity[testStarted name='%v']\n", testName) 263 return func(pass int) { 264 durMs := time.Since(now).Nanoseconds() / 1e6 265 switch pass { 266 case testPass: 267 fmt.Printf("##teamcity[testFinished name='%v' duration='%v']\n", testName, durMs) 268 case testFail: 269 fmt.Printf("##teamcity[testFailed='%v' duration='%v']\n", testName, durMs) 270 case testIncomplete: 271 fmt.Printf("##teamcity[testFailed='%v' duration='%v' message='Test incomplete.']\n", 272 testName, durMs) 273 } 274 } 275 } 276 277 func main() { 278 pflag.Parse() 279 280 if os.Getenv("JEPSEN_ROOT") == "" { 281 log.Fatal("JEPSEN_ROOT must be set.") 282 } 283 if os.Getenv("GOPATH") == "" { 284 log.Fatal("GOPATH must be set.") 285 } 286 287 if *doDownOnly { 288 jepsenDown() 289 os.Exit(0) 290 } 291 if *doUpOnly { 292 jepsenUp() 293 os.Exit(0) 294 } 295 296 if *testAll { 297 *workload = strings.Join(availableWorkloads, " ") 298 *nemesis = strings.Join(availableNemeses, " ") 299 } 300 301 if *workload == "" || *nemesis == "" { 302 fmt.Printf("You must specify a workload and a nemesis.\n") 303 304 fmt.Printf("Available workloads:\n") 305 for _, w := range availableWorkloads { 306 fmt.Printf("\t%v\n", w) 307 } 308 fmt.Printf("Available nemeses:\n") 309 for _, n := range availableNemeses { 310 fmt.Printf("\t%v\n", n) 311 } 312 fmt.Printf("Example commands:\n") 313 fmt.Printf("$ %v -w bank -n none\n", os.Args[0]) 314 fmt.Printf("$ %v -w 'bank delete' -n 'none kill-alpha,kill-zero move-tablet'\n", os.Args[0]) 315 fmt.Printf("$ %v --test-all\n", os.Args[0]) 316 os.Exit(1) 317 } 318 319 if strings.Contains(*nemesis, "skew-clock") && *skew == "" { 320 log.Fatal("skew-clock nemesis specified but --jepsen.skew wasn't set.") 321 } 322 323 if *doUp { 324 jepsenUp() 325 } 326 if *doServe { 327 go jepsenServe() 328 if *web && !*dryRun { 329 openJepsenBrowser() 330 } 331 } 332 if *web && !*dryRun && *jaeger != "" { 333 // Open Jaeger UI 334 browser.Open("http://localhost:16686") 335 } 336 337 workloads := strings.Split(*workload, " ") 338 nemeses := strings.Split(*nemesis, " ") 339 fmt.Printf("Num tests: %v\n", len(workloads)*len(nemeses)) 340 for _, n := range nemeses { 341 for _, w := range workloads { 342 tcEnd := tcStart(fmt.Sprintf("Workload:%v,Nemeses:%v", w, n)) 343 status := runJepsenTest(&jepsenTest{ 344 workload: w, 345 nemesis: n, 346 timeLimit: *timeLimit, 347 concurrency: *concurrency, 348 rebalanceInterval: *rebalanceInterval, 349 localBinary: *localBinary, 350 nodes: *nodes, 351 skew: *skew, 352 testCount: *testCount, 353 }) 354 tcEnd(status) 355 } 356 } 357 358 if *doDown { 359 jepsenDown() 360 } 361 }