github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/exec/chaosmonkey_test.go (about) 1 // Copyright 2019 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache 2.0 3 // license that can be found in the LICENSE file. 4 5 package exec 6 7 import ( 8 "context" 9 "flag" 10 "log" 11 "math/rand" 12 "sync" 13 "testing" 14 "time" 15 16 "github.com/grailbio/base/retry" 17 "github.com/grailbio/bigmachine/testsystem" 18 "github.com/grailbio/bigslice" 19 "golang.org/x/sync/errgroup" 20 ) 21 22 var chaos = flag.Bool("chaos", false, "run chaos monkey tests") 23 24 // Victim is a bigslice.Func that produces a simple bigslice operation that 25 // is about the simplest, sharded op that requires inter-node communication. 26 // The victim op sleeps with random durations (exponentially distributed) 27 // to give the chaos monkey time to act. 28 var victim = bigslice.Func(func() (slice bigslice.Slice) { 29 data := make([]int, 1000) 30 for i := range data { 31 data[i] = rand.Int() 32 } 33 slice = bigslice.Const(10, data) 34 slice = bigslice.Map(slice, func(i int) (int, int) { 35 return i % 15, i 36 }) 37 slice = bigslice.Reduce(slice, func(a, e int) int { 38 // This gives us on average 10s runs, when free of failures. 39 time.Sleep(time.Duration(60*rand.ExpFloat64()) * time.Millisecond) 40 return a + e 41 }) 42 return 43 }) 44 45 func TestChaosMonkey(t *testing.T) { 46 if testing.Short() { 47 t.Skip("chaos monkey tests disable with -short") 48 } 49 // The nature of this test is highly nondeterministic, and there are 50 // always corner cases that need to be handled still. Further, a failed 51 // test usually manifests in running forever. Currently the test is in 52 // place to test and exercise code paths manually, not as part of 53 // CI testing. 54 if !*chaos { 55 t.Skip("chaos monkey tests disabled; pass -chaos to enable") 56 } 57 // This test takes way too long to recover with the default probation timeouts. 58 save := ProbationTimeout 59 ProbationTimeout = time.Second 60 defer func() { 61 ProbationTimeout = save 62 }() 63 system := testsystem.New() 64 system.Machineprocs = 2 65 system.KeepalivePeriod = time.Second 66 system.KeepaliveTimeout = 2 * time.Second 67 system.KeepaliveRpcTimeout = time.Second 68 69 ctx, cancel := context.WithCancel(context.Background()) 70 start := time.Now() 71 sess := Start(Bigmachine(system), Parallelism(10)) 72 var g errgroup.Group 73 g.Go(func() error { 74 // Aggressively kill machines in the beginning, and then back off 75 // so that we have a chance to actually recover. 76 var ( 77 wait = time.Second 78 killerStart = time.Now() 79 ) 80 for { 81 select { 82 case <-ctx.Done(): 83 return nil 84 case <-time.After(wait): 85 wait += time.Duration(500+rand.Intn(2000)) * time.Millisecond 86 log.Printf("activating next chaos monkey in %s", wait) 87 } 88 if system.Kill(nil) { 89 log.Print("the simian army claimed yet another victim!") 90 } 91 if time.Since(killerStart) > time.Minute { 92 return nil 93 } 94 } 95 }) 96 _, err := sess.Run(ctx, victim) 97 cancel() 98 t.Logf("victim ran in %s", time.Since(start)) 99 if err != nil { 100 t.Error(err) 101 } 102 if err = g.Wait(); err != nil { 103 t.Fatal(err) 104 } 105 } 106 107 // TestDiscardChaos verifies that execution is robust to concurrent evaluation 108 // and discarding. It does this by repeatedly concurrently evaluating and 109 // discarding the same task graph and verifying that a final evaluation 110 // produces correct results. 111 func TestDiscardChaos(t *testing.T) { 112 if testing.Short() { 113 t.Skip("chaos monkey tests disable with -short") 114 } 115 if !*chaos { 116 t.Skip("chaos monkey tests disabled; pass -chaos to enable") 117 } 118 origEnableMaxConsecutiveLost := enableMaxConsecutiveLost 119 enableMaxConsecutiveLost = false 120 origRetryPolicy := retryPolicy 121 retryPolicy = retry.MaxRetries(retry.Backoff(100*time.Millisecond, 1*time.Second, 2), 5) 122 defer func() { 123 enableMaxConsecutiveLost = origEnableMaxConsecutiveLost 124 retryPolicy = origRetryPolicy 125 }() 126 const Nshard = 10 127 const N = Nshard * 100 128 // Niter is the number of stress test iterations. Each iteration 129 // concurrently runs and discards a task graph, then verifies that a final 130 // evaluation produces correct results. 131 const Niter = 5 132 f := bigslice.Func(func() bigslice.Slice { 133 vs := make([]int, N) 134 for i := range vs { 135 vs[i] = i 136 } 137 slice := bigslice.Const(Nshard, vs, append([]int{}, vs...)) 138 slice = bigslice.Reduce(slice, func(x, y int) int { 139 time.Sleep(1 * time.Millisecond) 140 return x + y 141 }) 142 return slice 143 }) 144 id := bigslice.Func(func(result *Result) bigslice.Slice { 145 return result 146 }) 147 for j := 0; j < Niter; j++ { 148 system := testsystem.New() 149 system.Machineprocs = 2 150 system.KeepalivePeriod = 500 * time.Millisecond 151 system.KeepaliveTimeout = 1 * time.Second 152 system.KeepaliveRpcTimeout = 500 * time.Millisecond 153 sess := Start(Bigmachine(system), Parallelism(8)) 154 ctx := context.Background() 155 result, err := sess.Run(ctx, f) 156 if err != nil { 157 t.Fatal(err) 158 } 159 ctxRace, cancelRace := context.WithTimeout(ctx, 20*time.Second) 160 // Start two goroutines, one which continually evaluates and one 161 // which continually discards results. 162 var wg sync.WaitGroup 163 wg.Add(1) 164 go func() { 165 defer wg.Done() 166 wait := 10 * time.Millisecond 167 for { 168 select { 169 case <-ctxRace.Done(): 170 return 171 case <-time.After(wait): 172 _, runErr := sess.Run(ctxRace, id, result) 173 if runErr != nil && ctxRace.Err() == nil { 174 t.Error(runErr) 175 } 176 } 177 } 178 }() 179 wg.Add(1) 180 go func() { 181 defer wg.Done() 182 wait := 10 * time.Millisecond 183 for { 184 select { 185 case <-ctxRace.Done(): 186 return 187 case <-time.After(wait): 188 wait += time.Duration(rand.Intn(10)) * time.Millisecond 189 result.Discard(ctxRace) 190 } 191 } 192 }() 193 wg.Add(1) 194 go func() { 195 defer wg.Done() 196 wait := time.Duration(rand.Intn(250)) * time.Millisecond 197 for { 198 select { 199 case <-ctxRace.Done(): 200 return 201 case <-time.After(wait): 202 wait += time.Duration(10+rand.Intn(250)) * time.Millisecond 203 log.Printf("activating next chaos monkey in %s", wait) 204 if system.Kill(nil) { 205 log.Print("the simian army claimed yet another victim!") 206 } 207 } 208 } 209 }() 210 wg.Wait() 211 cancelRace() 212 // Do one final evaluation, the result of which we verify for 213 // correctness. 214 result, err = sess.Run(ctx, id, result) 215 if err != nil { 216 t.Fatal(err) 217 } 218 s := result.Scanner() 219 x := rand.Int() 220 var count, i, j int 221 for s.Scan(ctx, &i, &j) { 222 count++ 223 if i != j { 224 t.Error("result computed incorrectly") 225 break 226 } 227 } 228 if scanErr := s.Err(); scanErr != nil { 229 t.Fatal(scanErr) 230 } 231 if got, want := count, N; got != want { 232 t.Errorf("%v got %v, want %v", x, got, want) 233 } 234 } 235 }