github.com/mailgun/holster/v4@v4.20.0/election/election_test.go (about)

     1  package election_test
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"testing"
     8  	"time"
     9  
    10  	"github.com/mailgun/holster/v4/election"
    11  	"github.com/mailgun/holster/v4/slice"
    12  	"github.com/mailgun/holster/v4/testutil"
    13  	"github.com/sirupsen/logrus"
    14  	"github.com/stretchr/testify/assert"
    15  	"github.com/stretchr/testify/require"
    16  )
    17  
    18  var (
    19  	cfg            *election.Config
    20  	ErrConnRefused = errors.New("connection refused")
    21  )
    22  
    23  func init() {
    24  	logrus.SetLevel(logrus.DebugLevel)
    25  	cfg = &election.Config{
    26  		NetworkTimeout:      time.Second,
    27  		HeartBeatTimeout:    time.Second,
    28  		LeaderQuorumTimeout: time.Second * 2,
    29  		ElectionTimeout:     time.Second * 2,
    30  	}
    31  }
    32  
    33  func createCluster(t *testing.T, c *TestCluster) {
    34  	t.Helper()
    35  
    36  	// Start with a known leader
    37  	err := c.SpawnNode("n0", cfg)
    38  	require.NoError(t, err)
    39  	testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) {
    40  		status := c.GetClusterStatus()
    41  		assert.Equal(t, ClusterStatus{
    42  			"n0": "n0",
    43  		}, status)
    44  	})
    45  
    46  	// Added nodes should become followers
    47  	err = c.SpawnNode("n1", cfg)
    48  	require.NoError(t, err)
    49  	err = c.SpawnNode("n2", cfg)
    50  	require.NoError(t, err)
    51  	err = c.SpawnNode("n3", cfg)
    52  	require.NoError(t, err)
    53  	err = c.SpawnNode("n4", cfg)
    54  	require.NoError(t, err)
    55  
    56  	testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) {
    57  		status := c.GetClusterStatus()
    58  		assert.Equal(t, ClusterStatus{
    59  			"n0": "n0",
    60  			"n1": "n0",
    61  			"n2": "n0",
    62  			"n3": "n0",
    63  			"n4": "n0",
    64  		}, status)
    65  	})
    66  }
    67  
    68  func TestSingleNodeLeader(t *testing.T) {
    69  	c := NewTestCluster(t)
    70  	err := c.SpawnNode("n0", cfg)
    71  	require.NoError(t, err)
    72  	testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) {
    73  		status := c.GetClusterStatus()
    74  		assert.Equal(t, ClusterStatus{
    75  			"n0": "n0",
    76  		}, status)
    77  	})
    78  
    79  	// Consume first leader election event
    80  	event := <-c.OnChangeCh
    81  	assert.Equal(t, "n0", event.Leader)
    82  	assert.Equal(t, "n0", event.From)
    83  
    84  	assert.True(t, c.Nodes["n0"].Node.IsLeader())
    85  
    86  	select {
    87  	// Should NOT receive a leadership change as we are the only node
    88  	case <-c.OnChangeCh:
    89  		t.Log("received un-expected leader change")
    90  		t.FailNow()
    91  	case <-time.After(cfg.HeartBeatTimeout * 3):
    92  	}
    93  }
    94  
    95  func TestSimpleElection(t *testing.T) {
    96  	c := NewTestCluster(t)
    97  	createCluster(t, c)
    98  	defer c.Close()
    99  
   100  	err := c.Nodes["n0"].Node.Resign(context.Background())
   101  	require.NoError(t, err)
   102  
   103  	// Wait until n0 is no longer leader
   104  	testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) {
   105  		candidate := c.GetLeader()
   106  		if !assert.NotNil(t, candidate) {
   107  			return
   108  		}
   109  		assert.NotEqual(t, "n0", candidate.GetLeader())
   110  	})
   111  
   112  	for k, v := range c.Nodes {
   113  		t.Logf("Node: %s Leader: %t\n", k, v.Node.IsLeader())
   114  	}
   115  }
   116  
   117  func TestLeaderDisconnect(t *testing.T) {
   118  	c := NewTestCluster(t)
   119  	createCluster(t, c)
   120  	defer c.Close()
   121  
   122  	c.AddNetworkError("n0", ErrConnRefused)
   123  	defer c.DelNetworkError("n0")
   124  
   125  	// Should lose leadership
   126  	testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) {
   127  		node := c.Nodes["n0"]
   128  		if !assert.NotNil(t, node.Node) {
   129  			return
   130  		}
   131  		assert.NotEqual(t, "n0", node.Node.GetLeader())
   132  	})
   133  
   134  	for k, v := range c.Nodes {
   135  		t.Logf("Node: %s Leader: %t\n", k, v.Node.IsLeader())
   136  	}
   137  }
   138  
   139  func TestFollowerDisconnect(t *testing.T) {
   140  	c := NewTestCluster(t)
   141  	createCluster(t, c)
   142  	defer c.Close()
   143  
   144  	c.AddNetworkError("n4", ErrConnRefused)
   145  	defer c.DelNetworkError("n4")
   146  
   147  	// Wait until n4 loses leader
   148  	testutil.UntilPass(t, 5, time.Second, func(t testutil.TestingT) {
   149  		status := c.GetClusterStatus()
   150  		assert.NotEqual(t, "n0", status["n4"])
   151  	})
   152  
   153  	c.DelNetworkError("n4")
   154  
   155  	// Follower should resume being a follower without forcing a new election.
   156  	testutil.UntilPass(t, 60, time.Second, func(t testutil.TestingT) {
   157  		status := c.GetClusterStatus()
   158  		assert.Equal(t, "n0", status["n4"])
   159  	})
   160  }
   161  
   162  func TestSplitBrain(t *testing.T) {
   163  	c1 := NewTestCluster(t)
   164  	createCluster(t, c1)
   165  	defer c1.Close()
   166  
   167  	c2 := NewTestCluster(t)
   168  
   169  	// Now take 2 nodes from cluster 1 and put them in their own cluster.
   170  	// This causes n0 to lose contact with n2-n4 and should update the member list
   171  	// such that n0 only knows about n1.
   172  
   173  	// Since n0 was leader previously, it should remain leader
   174  	c2.Add("n0", c1.Remove("n0"))
   175  	c2.Add("n1", c1.Remove("n1"))
   176  
   177  	// Cluster 1 should elect a new leader
   178  	testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) {
   179  		assert.NotNil(t, c1.GetLeader())
   180  	})
   181  
   182  	for k, v := range c1.Nodes {
   183  		t.Logf("C1 Node: %s Leader: %t\n", k, v.Node.IsLeader())
   184  	}
   185  
   186  	// Cluster 2 should elect a new leader
   187  	testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) {
   188  		assert.NotNil(t, c2.GetLeader())
   189  	})
   190  
   191  	for k, v := range c2.Nodes {
   192  		t.Logf("C2 Node: %s Leader: %t\n", k, v.Node.IsLeader())
   193  	}
   194  
   195  	// Move the nodes in cluster2, back to the cluster1
   196  	c1.Add("n0", c2.Remove("n0"))
   197  	c1.Add("n1", c2.Remove("n1"))
   198  
   199  	// The nodes should detect 2 leaders and start a new vote.
   200  	testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) {
   201  		status := c1.GetClusterStatus()
   202  		var leaders []string
   203  		for _, v := range status {
   204  			if slice.ContainsString(v, leaders, nil) {
   205  				continue
   206  			}
   207  			leaders = append(leaders, v)
   208  		}
   209  		if !assert.NotNil(t, leaders) {
   210  			return
   211  		}
   212  		assert.Equal(t, 1, len(leaders))
   213  		assert.NotEmpty(t, leaders[0])
   214  	})
   215  
   216  	for k, v := range c1.Nodes {
   217  		t.Logf("Node: %s Leader: %t\n", k, v.Node.IsLeader())
   218  	}
   219  }
   220  
   221  func TestOmissionFaults(t *testing.T) {
   222  	c1 := NewTestCluster(t)
   223  	createCluster(t, c1)
   224  	defer c1.Close()
   225  
   226  	// Create an unstable cluster with n3 and n4 only able to contact n1 and n2 respectively.
   227  	// The end result should be an omission fault of less than quorum.
   228  	//
   229  	// Diagram: lines indicate connectivity between nodes
   230  	// (n0)-----(n1)----(n4)
   231  	//   \       /
   232  	//	  \     /
   233  	//     \   /
   234  	//      (n2)----(n3)
   235  	//
   236  
   237  	// n3 and n4 can't talk
   238  	c1.Disconnect("n3", "n4", ErrConnRefused)
   239  	c1.Disconnect("n4", "n3", ErrConnRefused)
   240  
   241  	// Leader can't talk to n4
   242  	c1.Disconnect("n0", "n4", ErrConnRefused)
   243  	c1.Disconnect("n4", "n0", ErrConnRefused)
   244  
   245  	// Leader can't talk to n3
   246  	c1.Disconnect("n0", "n3", ErrConnRefused)
   247  	c1.Disconnect("n3", "n0", ErrConnRefused)
   248  
   249  	// n2 and n4 can't talk
   250  	c1.Disconnect("n2", "n4", ErrConnRefused)
   251  	c1.Disconnect("n4", "n2", ErrConnRefused)
   252  
   253  	// n1 and n3 can't talk
   254  	c1.Disconnect("n1", "n3", ErrConnRefused)
   255  	c1.Disconnect("n3", "n1", ErrConnRefused)
   256  
   257  	// Cluster should retain n0 as leader in the face on unstable cluster
   258  	for i := 0; i < 12; i++ {
   259  		leader := c1.GetLeader()
   260  		require.NotNil(t, leader)
   261  		require.Equal(t, leader.GetLeader(), "n0")
   262  		time.Sleep(time.Millisecond * 400)
   263  	}
   264  
   265  	// Should retain leader once communication is restored
   266  	c1.ClearErrors()
   267  
   268  	for i := 0; i < 12; i++ {
   269  		leader := c1.GetLeader()
   270  		require.NotNil(t, leader)
   271  		require.Equal(t, leader.GetLeader(), "n0")
   272  		time.Sleep(time.Millisecond * 400)
   273  	}
   274  }
   275  
   276  func TestIsolatedLeader(t *testing.T) {
   277  	c1 := NewTestCluster(t)
   278  	createCluster(t, c1)
   279  	defer c1.Close()
   280  
   281  	// Create a cluster where the leader become isolated from the rest
   282  	// of the cluster.
   283  	//
   284  	// Diagram: lines indicate connectivity
   285  	// between nodes and n0 is leader
   286  	//
   287  	// (n0)----(n1)----(n4)
   288  	//          / \     /
   289  	//	       /   \   /
   290  	//        /     \ /
   291  	//      (n2)----(n3)
   292  	//
   293  	require.Equal(t, c1.GetLeader().GetLeader(), "n0")
   294  
   295  	// Leader can't talk to n2
   296  	c1.Disconnect("n0", "n2", ErrConnRefused)
   297  	c1.Disconnect("n2", "n0", ErrConnRefused)
   298  
   299  	// Leader can't talk to n3
   300  	c1.Disconnect("n0", "n3", ErrConnRefused)
   301  	c1.Disconnect("n3", "n0", ErrConnRefused)
   302  
   303  	// Leader can't talk to n4
   304  	c1.Disconnect("n0", "n4", ErrConnRefused)
   305  	c1.Disconnect("n4", "n0", ErrConnRefused)
   306  
   307  	// Leader should realize it doesn't have a quorum of
   308  	// heartbeats and step down and remaining cluster should
   309  	// elect a new leader
   310  	for i := 0; i < 20; i++ {
   311  		leader := c1.GetLeader()
   312  		if leader == nil {
   313  			goto sleep
   314  		}
   315  
   316  		// Leader should no longer be n0
   317  		if leader.GetLeader() != "n0" {
   318  			// A node in the new cluster must agree and have elected a new leader
   319  			l := c1.Nodes["n4"].Node.GetLeader()
   320  			if l != "" && l == "n0" {
   321  				break
   322  			}
   323  		}
   324  	sleep:
   325  		time.Sleep(time.Millisecond * 500)
   326  	}
   327  	require.NotNil(t, c1.GetLeader())
   328  	require.NotEqual(t, c1.GetLeader().GetLeader(), "n0")
   329  	// Note: In the case where n1 is elected the new leader,
   330  	// n0 will know that n1 is the new leader sooner than later
   331  	// since connectivity from n0 to n1 was never interrupted.
   332  	// fmt.Printf("Cluster: %#v\n", c1.GetClusterStatus())
   333  
   334  	// Should persist new leader once communication is restored
   335  	c1.ClearErrors()
   336  
   337  	// Should pick up the leadership from the rest of the cluster
   338  	testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) {
   339  		leader := c1.Nodes["n0"].Node.GetLeader()
   340  		assert.NotEqual(t, leader, "")
   341  	})
   342  
   343  	s, err := c1.Nodes["n0"].Node.GetState(context.Background())
   344  	fmt.Printf("State: %#v\n", s)
   345  	require.NoError(t, err)
   346  	assert.Equal(t, "Follower", s.State)
   347  }
   348  
   349  func TestMinimumQuorum(t *testing.T) {
   350  	c := NewTestCluster(t)
   351  
   352  	cfg := &election.Config{
   353  		NetworkTimeout:      time.Second,
   354  		HeartBeatTimeout:    time.Second,
   355  		LeaderQuorumTimeout: time.Second * 2,
   356  		ElectionTimeout:     time.Second * 2,
   357  		MinimumQuorum:       2,
   358  	}
   359  
   360  	err := c.SpawnNode("n0", cfg)
   361  	require.NoError(t, err)
   362  
   363  	time.Sleep(time.Second * 5)
   364  
   365  	// Ensure n0 is not leader
   366  	status := c.GetClusterStatus()
   367  	require.NotEqual(t, "n0", status["n0"])
   368  
   369  	err = c.SpawnNode("n1", cfg)
   370  	require.NoError(t, err)
   371  
   372  	// Should elect a leader
   373  	testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) {
   374  		status := c.GetClusterStatus()
   375  		assert.NotEqual(t, status["n0"], "")
   376  	})
   377  
   378  	status = c.GetClusterStatus()
   379  	var leader string
   380  
   381  	// Shutdown the follower
   382  	if status["n0"] == "n0" {
   383  		err = c.Remove("n1").Node.Stop(context.Background())
   384  		require.NoError(t, err)
   385  		leader = "n0"
   386  	} else {
   387  		err = c.Remove("n0").Node.Stop(context.Background())
   388  		require.NoError(t, err)
   389  		leader = "n1"
   390  	}
   391  
   392  	// The leader should detect it no longer has MinimumQuorum and step down
   393  	testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) {
   394  		status := c.GetClusterStatus()
   395  		assert.Equal(t, status[leader], "")
   396  	})
   397  }
   398  
   399  func TestResign(t *testing.T) {
   400  	c1 := NewTestCluster(t)
   401  	createCluster(t, c1)
   402  	defer c1.Close()
   403  
   404  	testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) {
   405  		assert.NotNil(t, c1.GetLeader())
   406  	})
   407  
   408  	leader := c1.GetLeader()
   409  
   410  	// Calling resign on a follower should have no effect
   411  	err := c1.Nodes["n1"].Node.Resign(context.Background())
   412  	assert.ErrorContains(t, err, "not the leader")
   413  
   414  	for i := 0; i < 10; i++ {
   415  		if c1.GetLeader() != leader {
   416  			require.FailNow(t, "leader should not have changed")
   417  		}
   418  		time.Sleep(time.Millisecond * 500)
   419  	}
   420  	// Calling resign on the leader should give up leader
   421  	err = c1.Nodes["n0"].Node.Resign(context.Background())
   422  	require.NoError(t, err)
   423  
   424  	testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) {
   425  		assert.NotEqual(t, leader, c1.GetLeader())
   426  	})
   427  }
   428  
   429  func TestResignSingleNode(t *testing.T) {
   430  	c := NewTestCluster(t)
   431  	err := c.SpawnNode("n0", cfg)
   432  	require.NoError(t, err)
   433  	defer c.Close()
   434  
   435  	testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) {
   436  		status := c.GetClusterStatus()
   437  		assert.Equal(t, ClusterStatus{
   438  			"n0": "n0",
   439  		}, status)
   440  	})
   441  
   442  	err = c.Nodes["n0"].Node.Resign(context.Background())
   443  	require.NoError(t, err)
   444  
   445  	// n0 will eventually become leader again
   446  	testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) {
   447  		status := c.GetClusterStatus()
   448  		assert.Equal(t, ClusterStatus{
   449  			"n0": "n0",
   450  		}, status)
   451  	})
   452  }