github.com/mailgun/holster/v4@v4.20.0/election/election_test.go (about) 1 package election_test 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "testing" 8 "time" 9 10 "github.com/mailgun/holster/v4/election" 11 "github.com/mailgun/holster/v4/slice" 12 "github.com/mailgun/holster/v4/testutil" 13 "github.com/sirupsen/logrus" 14 "github.com/stretchr/testify/assert" 15 "github.com/stretchr/testify/require" 16 ) 17 18 var ( 19 cfg *election.Config 20 ErrConnRefused = errors.New("connection refused") 21 ) 22 23 func init() { 24 logrus.SetLevel(logrus.DebugLevel) 25 cfg = &election.Config{ 26 NetworkTimeout: time.Second, 27 HeartBeatTimeout: time.Second, 28 LeaderQuorumTimeout: time.Second * 2, 29 ElectionTimeout: time.Second * 2, 30 } 31 } 32 33 func createCluster(t *testing.T, c *TestCluster) { 34 t.Helper() 35 36 // Start with a known leader 37 err := c.SpawnNode("n0", cfg) 38 require.NoError(t, err) 39 testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) { 40 status := c.GetClusterStatus() 41 assert.Equal(t, ClusterStatus{ 42 "n0": "n0", 43 }, status) 44 }) 45 46 // Added nodes should become followers 47 err = c.SpawnNode("n1", cfg) 48 require.NoError(t, err) 49 err = c.SpawnNode("n2", cfg) 50 require.NoError(t, err) 51 err = c.SpawnNode("n3", cfg) 52 require.NoError(t, err) 53 err = c.SpawnNode("n4", cfg) 54 require.NoError(t, err) 55 56 testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) { 57 status := c.GetClusterStatus() 58 assert.Equal(t, ClusterStatus{ 59 "n0": "n0", 60 "n1": "n0", 61 "n2": "n0", 62 "n3": "n0", 63 "n4": "n0", 64 }, status) 65 }) 66 } 67 68 func TestSingleNodeLeader(t *testing.T) { 69 c := NewTestCluster(t) 70 err := c.SpawnNode("n0", cfg) 71 require.NoError(t, err) 72 testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) { 73 status := c.GetClusterStatus() 74 assert.Equal(t, ClusterStatus{ 75 "n0": "n0", 76 }, status) 77 }) 78 79 // Consume first leader election event 80 event := <-c.OnChangeCh 81 assert.Equal(t, "n0", event.Leader) 82 assert.Equal(t, "n0", event.From) 83 84 assert.True(t, c.Nodes["n0"].Node.IsLeader()) 85 86 select { 87 // Should NOT receive a leadership change as we are the only node 88 case <-c.OnChangeCh: 89 t.Log("received un-expected leader change") 90 t.FailNow() 91 case <-time.After(cfg.HeartBeatTimeout * 3): 92 } 93 } 94 95 func TestSimpleElection(t *testing.T) { 96 c := NewTestCluster(t) 97 createCluster(t, c) 98 defer c.Close() 99 100 err := c.Nodes["n0"].Node.Resign(context.Background()) 101 require.NoError(t, err) 102 103 // Wait until n0 is no longer leader 104 testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) { 105 candidate := c.GetLeader() 106 if !assert.NotNil(t, candidate) { 107 return 108 } 109 assert.NotEqual(t, "n0", candidate.GetLeader()) 110 }) 111 112 for k, v := range c.Nodes { 113 t.Logf("Node: %s Leader: %t\n", k, v.Node.IsLeader()) 114 } 115 } 116 117 func TestLeaderDisconnect(t *testing.T) { 118 c := NewTestCluster(t) 119 createCluster(t, c) 120 defer c.Close() 121 122 c.AddNetworkError("n0", ErrConnRefused) 123 defer c.DelNetworkError("n0") 124 125 // Should lose leadership 126 testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) { 127 node := c.Nodes["n0"] 128 if !assert.NotNil(t, node.Node) { 129 return 130 } 131 assert.NotEqual(t, "n0", node.Node.GetLeader()) 132 }) 133 134 for k, v := range c.Nodes { 135 t.Logf("Node: %s Leader: %t\n", k, v.Node.IsLeader()) 136 } 137 } 138 139 func TestFollowerDisconnect(t *testing.T) { 140 c := NewTestCluster(t) 141 createCluster(t, c) 142 defer c.Close() 143 144 c.AddNetworkError("n4", ErrConnRefused) 145 defer c.DelNetworkError("n4") 146 147 // Wait until n4 loses leader 148 testutil.UntilPass(t, 5, time.Second, func(t testutil.TestingT) { 149 status := c.GetClusterStatus() 150 assert.NotEqual(t, "n0", status["n4"]) 151 }) 152 153 c.DelNetworkError("n4") 154 155 // Follower should resume being a follower without forcing a new election. 156 testutil.UntilPass(t, 60, time.Second, func(t testutil.TestingT) { 157 status := c.GetClusterStatus() 158 assert.Equal(t, "n0", status["n4"]) 159 }) 160 } 161 162 func TestSplitBrain(t *testing.T) { 163 c1 := NewTestCluster(t) 164 createCluster(t, c1) 165 defer c1.Close() 166 167 c2 := NewTestCluster(t) 168 169 // Now take 2 nodes from cluster 1 and put them in their own cluster. 170 // This causes n0 to lose contact with n2-n4 and should update the member list 171 // such that n0 only knows about n1. 172 173 // Since n0 was leader previously, it should remain leader 174 c2.Add("n0", c1.Remove("n0")) 175 c2.Add("n1", c1.Remove("n1")) 176 177 // Cluster 1 should elect a new leader 178 testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) { 179 assert.NotNil(t, c1.GetLeader()) 180 }) 181 182 for k, v := range c1.Nodes { 183 t.Logf("C1 Node: %s Leader: %t\n", k, v.Node.IsLeader()) 184 } 185 186 // Cluster 2 should elect a new leader 187 testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) { 188 assert.NotNil(t, c2.GetLeader()) 189 }) 190 191 for k, v := range c2.Nodes { 192 t.Logf("C2 Node: %s Leader: %t\n", k, v.Node.IsLeader()) 193 } 194 195 // Move the nodes in cluster2, back to the cluster1 196 c1.Add("n0", c2.Remove("n0")) 197 c1.Add("n1", c2.Remove("n1")) 198 199 // The nodes should detect 2 leaders and start a new vote. 200 testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) { 201 status := c1.GetClusterStatus() 202 var leaders []string 203 for _, v := range status { 204 if slice.ContainsString(v, leaders, nil) { 205 continue 206 } 207 leaders = append(leaders, v) 208 } 209 if !assert.NotNil(t, leaders) { 210 return 211 } 212 assert.Equal(t, 1, len(leaders)) 213 assert.NotEmpty(t, leaders[0]) 214 }) 215 216 for k, v := range c1.Nodes { 217 t.Logf("Node: %s Leader: %t\n", k, v.Node.IsLeader()) 218 } 219 } 220 221 func TestOmissionFaults(t *testing.T) { 222 c1 := NewTestCluster(t) 223 createCluster(t, c1) 224 defer c1.Close() 225 226 // Create an unstable cluster with n3 and n4 only able to contact n1 and n2 respectively. 227 // The end result should be an omission fault of less than quorum. 228 // 229 // Diagram: lines indicate connectivity between nodes 230 // (n0)-----(n1)----(n4) 231 // \ / 232 // \ / 233 // \ / 234 // (n2)----(n3) 235 // 236 237 // n3 and n4 can't talk 238 c1.Disconnect("n3", "n4", ErrConnRefused) 239 c1.Disconnect("n4", "n3", ErrConnRefused) 240 241 // Leader can't talk to n4 242 c1.Disconnect("n0", "n4", ErrConnRefused) 243 c1.Disconnect("n4", "n0", ErrConnRefused) 244 245 // Leader can't talk to n3 246 c1.Disconnect("n0", "n3", ErrConnRefused) 247 c1.Disconnect("n3", "n0", ErrConnRefused) 248 249 // n2 and n4 can't talk 250 c1.Disconnect("n2", "n4", ErrConnRefused) 251 c1.Disconnect("n4", "n2", ErrConnRefused) 252 253 // n1 and n3 can't talk 254 c1.Disconnect("n1", "n3", ErrConnRefused) 255 c1.Disconnect("n3", "n1", ErrConnRefused) 256 257 // Cluster should retain n0 as leader in the face on unstable cluster 258 for i := 0; i < 12; i++ { 259 leader := c1.GetLeader() 260 require.NotNil(t, leader) 261 require.Equal(t, leader.GetLeader(), "n0") 262 time.Sleep(time.Millisecond * 400) 263 } 264 265 // Should retain leader once communication is restored 266 c1.ClearErrors() 267 268 for i := 0; i < 12; i++ { 269 leader := c1.GetLeader() 270 require.NotNil(t, leader) 271 require.Equal(t, leader.GetLeader(), "n0") 272 time.Sleep(time.Millisecond * 400) 273 } 274 } 275 276 func TestIsolatedLeader(t *testing.T) { 277 c1 := NewTestCluster(t) 278 createCluster(t, c1) 279 defer c1.Close() 280 281 // Create a cluster where the leader become isolated from the rest 282 // of the cluster. 283 // 284 // Diagram: lines indicate connectivity 285 // between nodes and n0 is leader 286 // 287 // (n0)----(n1)----(n4) 288 // / \ / 289 // / \ / 290 // / \ / 291 // (n2)----(n3) 292 // 293 require.Equal(t, c1.GetLeader().GetLeader(), "n0") 294 295 // Leader can't talk to n2 296 c1.Disconnect("n0", "n2", ErrConnRefused) 297 c1.Disconnect("n2", "n0", ErrConnRefused) 298 299 // Leader can't talk to n3 300 c1.Disconnect("n0", "n3", ErrConnRefused) 301 c1.Disconnect("n3", "n0", ErrConnRefused) 302 303 // Leader can't talk to n4 304 c1.Disconnect("n0", "n4", ErrConnRefused) 305 c1.Disconnect("n4", "n0", ErrConnRefused) 306 307 // Leader should realize it doesn't have a quorum of 308 // heartbeats and step down and remaining cluster should 309 // elect a new leader 310 for i := 0; i < 20; i++ { 311 leader := c1.GetLeader() 312 if leader == nil { 313 goto sleep 314 } 315 316 // Leader should no longer be n0 317 if leader.GetLeader() != "n0" { 318 // A node in the new cluster must agree and have elected a new leader 319 l := c1.Nodes["n4"].Node.GetLeader() 320 if l != "" && l == "n0" { 321 break 322 } 323 } 324 sleep: 325 time.Sleep(time.Millisecond * 500) 326 } 327 require.NotNil(t, c1.GetLeader()) 328 require.NotEqual(t, c1.GetLeader().GetLeader(), "n0") 329 // Note: In the case where n1 is elected the new leader, 330 // n0 will know that n1 is the new leader sooner than later 331 // since connectivity from n0 to n1 was never interrupted. 332 // fmt.Printf("Cluster: %#v\n", c1.GetClusterStatus()) 333 334 // Should persist new leader once communication is restored 335 c1.ClearErrors() 336 337 // Should pick up the leadership from the rest of the cluster 338 testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) { 339 leader := c1.Nodes["n0"].Node.GetLeader() 340 assert.NotEqual(t, leader, "") 341 }) 342 343 s, err := c1.Nodes["n0"].Node.GetState(context.Background()) 344 fmt.Printf("State: %#v\n", s) 345 require.NoError(t, err) 346 assert.Equal(t, "Follower", s.State) 347 } 348 349 func TestMinimumQuorum(t *testing.T) { 350 c := NewTestCluster(t) 351 352 cfg := &election.Config{ 353 NetworkTimeout: time.Second, 354 HeartBeatTimeout: time.Second, 355 LeaderQuorumTimeout: time.Second * 2, 356 ElectionTimeout: time.Second * 2, 357 MinimumQuorum: 2, 358 } 359 360 err := c.SpawnNode("n0", cfg) 361 require.NoError(t, err) 362 363 time.Sleep(time.Second * 5) 364 365 // Ensure n0 is not leader 366 status := c.GetClusterStatus() 367 require.NotEqual(t, "n0", status["n0"]) 368 369 err = c.SpawnNode("n1", cfg) 370 require.NoError(t, err) 371 372 // Should elect a leader 373 testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) { 374 status := c.GetClusterStatus() 375 assert.NotEqual(t, status["n0"], "") 376 }) 377 378 status = c.GetClusterStatus() 379 var leader string 380 381 // Shutdown the follower 382 if status["n0"] == "n0" { 383 err = c.Remove("n1").Node.Stop(context.Background()) 384 require.NoError(t, err) 385 leader = "n0" 386 } else { 387 err = c.Remove("n0").Node.Stop(context.Background()) 388 require.NoError(t, err) 389 leader = "n1" 390 } 391 392 // The leader should detect it no longer has MinimumQuorum and step down 393 testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) { 394 status := c.GetClusterStatus() 395 assert.Equal(t, status[leader], "") 396 }) 397 } 398 399 func TestResign(t *testing.T) { 400 c1 := NewTestCluster(t) 401 createCluster(t, c1) 402 defer c1.Close() 403 404 testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) { 405 assert.NotNil(t, c1.GetLeader()) 406 }) 407 408 leader := c1.GetLeader() 409 410 // Calling resign on a follower should have no effect 411 err := c1.Nodes["n1"].Node.Resign(context.Background()) 412 assert.ErrorContains(t, err, "not the leader") 413 414 for i := 0; i < 10; i++ { 415 if c1.GetLeader() != leader { 416 require.FailNow(t, "leader should not have changed") 417 } 418 time.Sleep(time.Millisecond * 500) 419 } 420 // Calling resign on the leader should give up leader 421 err = c1.Nodes["n0"].Node.Resign(context.Background()) 422 require.NoError(t, err) 423 424 testutil.UntilPass(t, 30, time.Second, func(t testutil.TestingT) { 425 assert.NotEqual(t, leader, c1.GetLeader()) 426 }) 427 } 428 429 func TestResignSingleNode(t *testing.T) { 430 c := NewTestCluster(t) 431 err := c.SpawnNode("n0", cfg) 432 require.NoError(t, err) 433 defer c.Close() 434 435 testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) { 436 status := c.GetClusterStatus() 437 assert.Equal(t, ClusterStatus{ 438 "n0": "n0", 439 }, status) 440 }) 441 442 err = c.Nodes["n0"].Node.Resign(context.Background()) 443 require.NoError(t, err) 444 445 // n0 will eventually become leader again 446 testutil.UntilPass(t, 10, time.Second, func(t testutil.TestingT) { 447 status := c.GetClusterStatus() 448 assert.Equal(t, ClusterStatus{ 449 "n0": "n0", 450 }, status) 451 }) 452 }