github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/gossip/convergence_test.go (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package gossip_test
    12  
    13  import (
    14  	"context"
    15  	"testing"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    18  	"github.com/cockroachdb/cockroach/pkg/gossip/simulation"
    19  	"github.com/cockroachdb/cockroach/pkg/testutils"
    20  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    21  	"github.com/cockroachdb/cockroach/pkg/util/log"
    22  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    23  )
    24  
    25  // The tests in this package have fairly small cluster sizes for the sake of
    26  // not taking too long to run when run as part of the normal unit tests. If
    27  // you're testing out gossip network behavior, you may find it useful to
    28  // increase the network size for these tests (adjusting the max thresholds
    29  // accordingly) and see how things behave.
const (
	// testConvergenceSize is the size argument TestConvergence passes to
	// simulation.NewNetwork (presumably the number of simulated nodes).
	testConvergenceSize        = 10
	// testReachesEquilibriumSize is the size argument
	// TestNetworkReachesEquilibrium passes to simulation.NewNetwork
	// (presumably the number of simulated nodes).
	testReachesEquilibriumSize = 24
)
    34  
    35  func connectionsRefused(network *simulation.Network) int64 {
    36  	var connsRefused int64
    37  	for _, node := range network.Nodes {
    38  		connsRefused += node.Gossip.GetNodeMetrics().ConnectionsRefused.Count()
    39  	}
    40  	return connsRefused
    41  }
    42  
    43  // TestConvergence verifies that a node gossip network converges within
    44  // a fixed number of simulation cycles. It's really difficult to
    45  // determine the right number for cycles because different things can
    46  // happen during a single cycle, depending on how much CPU time is
    47  // available. Eliminating this variability by getting more
    48  // synchronization primitives in place for the simulation is possible,
    49  // though two attempts so far have introduced more complexity into the
    50  // actual production gossip code than seems worthwhile for a unittest.
    51  // As such, the thresholds are drastically higher than is normally needed.
    52  //
    53  // As of Jan 2017, this normally takes ~12 cycles and 8-12 refused connections.
    54  func TestConvergence(t *testing.T) {
    55  	defer leaktest.AfterTest(t)()
    56  	if testutils.NightlyStress() {
    57  		t.Skip()
    58  	}
    59  
    60  	stopper := stop.NewStopper()
    61  	defer stopper.Stop(context.Background())
    62  
    63  	network := simulation.NewNetwork(stopper, testConvergenceSize, true, zonepb.DefaultZoneConfigRef())
    64  
    65  	const maxCycles = 100
    66  	if connectedCycle := network.RunUntilFullyConnected(); connectedCycle > maxCycles {
    67  		log.Warningf(context.Background(), "expected a fully-connected network within %d cycles; took %d",
    68  			maxCycles, connectedCycle)
    69  	}
    70  
    71  	const maxConnsRefused = 50
    72  	if connsRefused := connectionsRefused(network); connsRefused > maxConnsRefused {
    73  		log.Warningf(context.Background(),
    74  			"expected network to fully connect with <= %d connections refused; took %d",
    75  			maxConnsRefused, connsRefused)
    76  	}
    77  }
    78  
    79  // TestNetworkReachesEquilibrium ensures that the gossip network stops bouncing
    80  // refused connections around after a while and settles down.
    81  // As mentioned in the comment for TestConvergence, there is a large amount of
    82  // variability in how much gets done in each network cycle, and thus we have
    83  // to set thresholds that are drastically higher than is needed in the normal
    84  // case.
    85  //
    86  // As of Jan 2017, this normally takes 8-9 cycles and 50-60 refused connections.
    87  func TestNetworkReachesEquilibrium(t *testing.T) {
    88  	defer leaktest.AfterTest(t)()
    89  	if testutils.NightlyStress() {
    90  		t.Skip()
    91  	}
    92  
    93  	stopper := stop.NewStopper()
    94  	defer stopper.Stop(context.Background())
    95  
    96  	network := simulation.NewNetwork(stopper, testReachesEquilibriumSize, true, zonepb.DefaultZoneConfigRef())
    97  
    98  	var connsRefused int64
    99  	var cyclesWithoutChange int
   100  	var numCycles int
   101  	network.SimulateNetwork(func(cycle int, network *simulation.Network) bool {
   102  		numCycles = cycle
   103  		newConnsRefused := connectionsRefused(network)
   104  		if newConnsRefused > connsRefused {
   105  			connsRefused = newConnsRefused
   106  			cyclesWithoutChange = 0
   107  		} else {
   108  			cyclesWithoutChange++
   109  		}
   110  		if cycle%5 == 0 {
   111  			log.Infof(context.Background(), "cycle: %d, cyclesWithoutChange: %d, fullyConnected: %v",
   112  				cycle, cyclesWithoutChange, network.IsNetworkConnected())
   113  		}
   114  		return cyclesWithoutChange < 5
   115  	})
   116  
   117  	const maxCycles = 200
   118  	if numCycles > maxCycles {
   119  		log.Warningf(context.Background(), "expected a non-thrashing network within %d cycles; took %d",
   120  			maxCycles, numCycles)
   121  	}
   122  
   123  	const maxConnsRefused = 500
   124  	if connsRefused > maxConnsRefused {
   125  		log.Warningf(context.Background(),
   126  			"expected thrashing to die down with <= %d connections refused; took %d",
   127  			maxConnsRefused, connsRefused)
   128  	}
   129  }