github.com/grailbio/bigslice@v0.0.0-20230519005545-30c4c12152ad/exec/chaosmonkey_test.go (about)

     1  // Copyright 2019 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package exec
     6  
     7  import (
     8  	"context"
     9  	"flag"
    10  	"log"
    11  	"math/rand"
    12  	"sync"
    13  	"testing"
    14  	"time"
    15  
    16  	"github.com/grailbio/base/retry"
    17  	"github.com/grailbio/bigmachine/testsystem"
    18  	"github.com/grailbio/bigslice"
    19  	"golang.org/x/sync/errgroup"
    20  )
    21  
// chaos is the -chaos command line flag. It defaults to false;
// TestChaosMonkey and TestDiscardChaos skip themselves unless it is set.
var chaos = flag.Bool("chaos", false, "run chaos monkey tests")
    23  
    24  // Victim is a bigslice.Func that produces a simple bigslice operation that
    25  // is about the simplest, sharded op that requires inter-node communication.
    26  // The victim op sleeps with random durations (exponentially distributed)
    27  // to give the chaos monkey time to act.
    28  var victim = bigslice.Func(func() (slice bigslice.Slice) {
    29  	data := make([]int, 1000)
    30  	for i := range data {
    31  		data[i] = rand.Int()
    32  	}
    33  	slice = bigslice.Const(10, data)
    34  	slice = bigslice.Map(slice, func(i int) (int, int) {
    35  		return i % 15, i
    36  	})
    37  	slice = bigslice.Reduce(slice, func(a, e int) int {
    38  		// This gives us on average 10s runs, when free of failures.
    39  		time.Sleep(time.Duration(60*rand.ExpFloat64()) * time.Millisecond)
    40  		return a + e
    41  	})
    42  	return
    43  })
    44  
    45  func TestChaosMonkey(t *testing.T) {
    46  	if testing.Short() {
    47  		t.Skip("chaos monkey tests disable with -short")
    48  	}
    49  	// The nature of this test is highly nondeterministic, and there are
    50  	// always corner cases that need to be handled still. Further, a failed
    51  	// test usually manifests in running forever. Currently the test is in
    52  	// place to test and exercise code paths manually, not as part of
    53  	// CI testing.
    54  	if !*chaos {
    55  		t.Skip("chaos monkey tests disabled; pass -chaos to enable")
    56  	}
    57  	// This test takes way too long to recover with the default probation timeouts.
    58  	save := ProbationTimeout
    59  	ProbationTimeout = time.Second
    60  	defer func() {
    61  		ProbationTimeout = save
    62  	}()
    63  	system := testsystem.New()
    64  	system.Machineprocs = 2
    65  	system.KeepalivePeriod = time.Second
    66  	system.KeepaliveTimeout = 2 * time.Second
    67  	system.KeepaliveRpcTimeout = time.Second
    68  
    69  	ctx, cancel := context.WithCancel(context.Background())
    70  	start := time.Now()
    71  	sess := Start(Bigmachine(system), Parallelism(10))
    72  	var g errgroup.Group
    73  	g.Go(func() error {
    74  		// Aggressively kill machines in the beginning, and then back off
    75  		// so that we have a chance to actually recover.
    76  		var (
    77  			wait        = time.Second
    78  			killerStart = time.Now()
    79  		)
    80  		for {
    81  			select {
    82  			case <-ctx.Done():
    83  				return nil
    84  			case <-time.After(wait):
    85  				wait += time.Duration(500+rand.Intn(2000)) * time.Millisecond
    86  				log.Printf("activating next chaos monkey in %s", wait)
    87  			}
    88  			if system.Kill(nil) {
    89  				log.Print("the simian army claimed yet another victim!")
    90  			}
    91  			if time.Since(killerStart) > time.Minute {
    92  				return nil
    93  			}
    94  		}
    95  	})
    96  	_, err := sess.Run(ctx, victim)
    97  	cancel()
    98  	t.Logf("victim ran in %s", time.Since(start))
    99  	if err != nil {
   100  		t.Error(err)
   101  	}
   102  	if err = g.Wait(); err != nil {
   103  		t.Fatal(err)
   104  	}
   105  }
   106  
   107  // TestDiscardChaos verifies that execution is robust to concurrent evaluation
   108  // and discarding. It does this by repeatedly concurrently evaluating and
   109  // discarding the same task graph and verifying that a final evaluation
   110  // produces correct results.
   111  func TestDiscardChaos(t *testing.T) {
   112  	if testing.Short() {
   113  		t.Skip("chaos monkey tests disable with -short")
   114  	}
   115  	if !*chaos {
   116  		t.Skip("chaos monkey tests disabled; pass -chaos to enable")
   117  	}
   118  	origEnableMaxConsecutiveLost := enableMaxConsecutiveLost
   119  	enableMaxConsecutiveLost = false
   120  	origRetryPolicy := retryPolicy
   121  	retryPolicy = retry.MaxRetries(retry.Backoff(100*time.Millisecond, 1*time.Second, 2), 5)
   122  	defer func() {
   123  		enableMaxConsecutiveLost = origEnableMaxConsecutiveLost
   124  		retryPolicy = origRetryPolicy
   125  	}()
   126  	const Nshard = 10
   127  	const N = Nshard * 100
   128  	// Niter is the number of stress test iterations. Each iteration
   129  	// concurrently runs and discards a task graph, then verifies that a final
   130  	// evaluation produces correct results.
   131  	const Niter = 5
   132  	f := bigslice.Func(func() bigslice.Slice {
   133  		vs := make([]int, N)
   134  		for i := range vs {
   135  			vs[i] = i
   136  		}
   137  		slice := bigslice.Const(Nshard, vs, append([]int{}, vs...))
   138  		slice = bigslice.Reduce(slice, func(x, y int) int {
   139  			time.Sleep(1 * time.Millisecond)
   140  			return x + y
   141  		})
   142  		return slice
   143  	})
   144  	id := bigslice.Func(func(result *Result) bigslice.Slice {
   145  		return result
   146  	})
   147  	for j := 0; j < Niter; j++ {
   148  		system := testsystem.New()
   149  		system.Machineprocs = 2
   150  		system.KeepalivePeriod = 500 * time.Millisecond
   151  		system.KeepaliveTimeout = 1 * time.Second
   152  		system.KeepaliveRpcTimeout = 500 * time.Millisecond
   153  		sess := Start(Bigmachine(system), Parallelism(8))
   154  		ctx := context.Background()
   155  		result, err := sess.Run(ctx, f)
   156  		if err != nil {
   157  			t.Fatal(err)
   158  		}
   159  		ctxRace, cancelRace := context.WithTimeout(ctx, 20*time.Second)
   160  		// Start two goroutines, one which continually evaluates and one
   161  		// which continually discards results.
   162  		var wg sync.WaitGroup
   163  		wg.Add(1)
   164  		go func() {
   165  			defer wg.Done()
   166  			wait := 10 * time.Millisecond
   167  			for {
   168  				select {
   169  				case <-ctxRace.Done():
   170  					return
   171  				case <-time.After(wait):
   172  					_, runErr := sess.Run(ctxRace, id, result)
   173  					if runErr != nil && ctxRace.Err() == nil {
   174  						t.Error(runErr)
   175  					}
   176  				}
   177  			}
   178  		}()
   179  		wg.Add(1)
   180  		go func() {
   181  			defer wg.Done()
   182  			wait := 10 * time.Millisecond
   183  			for {
   184  				select {
   185  				case <-ctxRace.Done():
   186  					return
   187  				case <-time.After(wait):
   188  					wait += time.Duration(rand.Intn(10)) * time.Millisecond
   189  					result.Discard(ctxRace)
   190  				}
   191  			}
   192  		}()
   193  		wg.Add(1)
   194  		go func() {
   195  			defer wg.Done()
   196  			wait := time.Duration(rand.Intn(250)) * time.Millisecond
   197  			for {
   198  				select {
   199  				case <-ctxRace.Done():
   200  					return
   201  				case <-time.After(wait):
   202  					wait += time.Duration(10+rand.Intn(250)) * time.Millisecond
   203  					log.Printf("activating next chaos monkey in %s", wait)
   204  					if system.Kill(nil) {
   205  						log.Print("the simian army claimed yet another victim!")
   206  					}
   207  				}
   208  			}
   209  		}()
   210  		wg.Wait()
   211  		cancelRace()
   212  		// Do one final evaluation, the result of which we verify for
   213  		// correctness.
   214  		result, err = sess.Run(ctx, id, result)
   215  		if err != nil {
   216  			t.Fatal(err)
   217  		}
   218  		s := result.Scanner()
   219  		x := rand.Int()
   220  		var count, i, j int
   221  		for s.Scan(ctx, &i, &j) {
   222  			count++
   223  			if i != j {
   224  				t.Error("result computed incorrectly")
   225  				break
   226  			}
   227  		}
   228  		if scanErr := s.Err(); scanErr != nil {
   229  			t.Fatal(scanErr)
   230  		}
   231  		if got, want := count, N; got != want {
   232  			t.Errorf("%v got %v, want %v", x, got, want)
   233  		}
   234  	}
   235  }