github.com/clly/consul@v1.4.5/agent/consul/autopilot_test.go (about)

     1  package consul
     2  
     3  import (
     4  	"os"
     5  	"testing"
     6  	"time"
     7  
     8  	"github.com/hashicorp/consul/testrpc"
     9  	"github.com/hashicorp/consul/testutil/retry"
    10  	"github.com/hashicorp/raft"
    11  	"github.com/hashicorp/serf/serf"
    12  )
    13  
    14  func TestAutopilot_IdempotentShutdown(t *testing.T) {
    15  	dir1, s1 := testServerWithConfig(t, nil)
    16  	defer os.RemoveAll(dir1)
    17  	defer s1.Shutdown()
    18  	retry.Run(t, func(r *retry.R) { r.Check(waitForLeader(s1)) })
    19  
    20  	s1.autopilot.Start()
    21  	s1.autopilot.Start()
    22  	s1.autopilot.Start()
    23  	s1.autopilot.Stop()
    24  	s1.autopilot.Stop()
    25  	s1.autopilot.Stop()
    26  }
    27  
    28  func TestAutopilot_CleanupDeadServer(t *testing.T) {
    29  	t.Parallel()
    30  	for i := 1; i <= 3; i++ {
    31  		testCleanupDeadServer(t, i)
    32  	}
    33  }
    34  
    35  func testCleanupDeadServer(t *testing.T, raftVersion int) {
    36  	dc := "dc1"
    37  	conf := func(c *Config) {
    38  		c.Datacenter = dc
    39  		c.Bootstrap = false
    40  		c.BootstrapExpect = 3
    41  		c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
    42  	}
    43  	dir1, s1 := testServerWithConfig(t, conf)
    44  	defer os.RemoveAll(dir1)
    45  	defer s1.Shutdown()
    46  
    47  	dir2, s2 := testServerWithConfig(t, conf)
    48  	defer os.RemoveAll(dir2)
    49  	defer s2.Shutdown()
    50  
    51  	dir3, s3 := testServerWithConfig(t, conf)
    52  	defer os.RemoveAll(dir3)
    53  	defer s3.Shutdown()
    54  
    55  	servers := []*Server{s1, s2, s3}
    56  
    57  	// Try to join
    58  	joinLAN(t, s2, s1)
    59  	joinLAN(t, s3, s1)
    60  
    61  	for _, s := range servers {
    62  		testrpc.WaitForLeader(t, s.RPC, dc)
    63  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
    64  	}
    65  
    66  	// Bring up a new server
    67  	dir4, s4 := testServerWithConfig(t, conf)
    68  	defer os.RemoveAll(dir4)
    69  	defer s4.Shutdown()
    70  
    71  	// Kill a non-leader server
    72  	s3.Shutdown()
    73  	retry.Run(t, func(r *retry.R) {
    74  		alive := 0
    75  		for _, m := range s1.LANMembers() {
    76  			if m.Status == serf.StatusAlive {
    77  				alive++
    78  			}
    79  		}
    80  		if alive != 2 {
    81  			r.Fatal(nil)
    82  		}
    83  	})
    84  
    85  	// Join the new server
    86  	joinLAN(t, s4, s1)
    87  	servers[2] = s4
    88  
    89  	// Make sure the dead server is removed and we're back to 3 total peers
    90  	for _, s := range servers {
    91  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
    92  	}
    93  }
    94  
    95  func TestAutopilot_CleanupDeadNonvoter(t *testing.T) {
    96  	dir1, s1 := testServer(t)
    97  	defer os.RemoveAll(dir1)
    98  	defer s1.Shutdown()
    99  
   100  	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
   101  	defer os.RemoveAll(dir2)
   102  	defer s2.Shutdown()
   103  
   104  	testrpc.WaitForLeader(t, s1.RPC, "dc1")
   105  
   106  	// Have s2 join and then shut it down immediately before it gets a chance to
   107  	// be promoted to a voter.
   108  	joinLAN(t, s2, s1)
   109  	retry.Run(t, func(r *retry.R) {
   110  		r.Check(wantRaft([]*Server{s1, s2}))
   111  	})
   112  	s2.Shutdown()
   113  
   114  	retry.Run(t, func(r *retry.R) {
   115  		r.Check(wantRaft([]*Server{s1}))
   116  	})
   117  }
   118  
   119  func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
   120  	t.Parallel()
   121  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
   122  		c.Datacenter = "dc1"
   123  		c.Bootstrap = true
   124  	})
   125  	defer os.RemoveAll(dir1)
   126  	defer s1.Shutdown()
   127  
   128  	conf := func(c *Config) {
   129  		c.Datacenter = "dc1"
   130  		c.Bootstrap = false
   131  	}
   132  
   133  	dir2, s2 := testServerWithConfig(t, conf)
   134  	defer os.RemoveAll(dir2)
   135  	defer s2.Shutdown()
   136  
   137  	dir3, s3 := testServerWithConfig(t, conf)
   138  	defer os.RemoveAll(dir3)
   139  	defer s3.Shutdown()
   140  
   141  	dir4, s4 := testServerWithConfig(t, conf)
   142  	defer os.RemoveAll(dir4)
   143  	defer s4.Shutdown()
   144  
   145  	dir5, s5 := testServerWithConfig(t, conf)
   146  	defer os.RemoveAll(dir5)
   147  	defer s5.Shutdown()
   148  
   149  	servers := []*Server{s1, s2, s3, s4, s5}
   150  
   151  	// Join the servers to s1, and wait until they are all promoted to
   152  	// voters.
   153  	for _, s := range servers[1:] {
   154  		joinLAN(t, s, s1)
   155  	}
   156  	retry.Run(t, func(r *retry.R) {
   157  		r.Check(wantRaft(servers))
   158  		for _, s := range servers {
   159  			r.Check(wantPeers(s, 5))
   160  		}
   161  	})
   162  
   163  	// Kill a non-leader server
   164  	s4.Shutdown()
   165  
   166  	// Should be removed from the peers automatically
   167  	servers = []*Server{s1, s2, s3, s5}
   168  	retry.Run(t, func(r *retry.R) {
   169  		r.Check(wantRaft(servers))
   170  		for _, s := range servers {
   171  			r.Check(wantPeers(s, 4))
   172  		}
   173  	})
   174  }
   175  
   176  func TestAutopilot_RollingUpdate(t *testing.T) {
   177  	t.Parallel()
   178  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
   179  		c.Datacenter = "dc1"
   180  		c.Bootstrap = true
   181  	})
   182  	defer os.RemoveAll(dir1)
   183  	defer s1.Shutdown()
   184  
   185  	conf := func(c *Config) {
   186  		c.Datacenter = "dc1"
   187  		c.Bootstrap = false
   188  	}
   189  
   190  	dir2, s2 := testServerWithConfig(t, conf)
   191  	defer os.RemoveAll(dir2)
   192  	defer s2.Shutdown()
   193  
   194  	dir3, s3 := testServerWithConfig(t, conf)
   195  	defer os.RemoveAll(dir3)
   196  	defer s3.Shutdown()
   197  
   198  	// Join the servers to s1, and wait until they are all promoted to
   199  	// voters.
   200  	servers := []*Server{s1, s2, s3}
   201  	for _, s := range servers[1:] {
   202  		joinLAN(t, s, s1)
   203  	}
   204  	retry.Run(t, func(r *retry.R) {
   205  		r.Check(wantRaft(servers))
   206  		for _, s := range servers {
   207  			r.Check(wantPeers(s, 3))
   208  		}
   209  	})
   210  
   211  	// Add one more server like we are doing a rolling update.
   212  	dir4, s4 := testServerWithConfig(t, conf)
   213  	defer os.RemoveAll(dir4)
   214  	defer s4.Shutdown()
   215  	joinLAN(t, s1, s4)
   216  	servers = append(servers, s4)
   217  	retry.Run(t, func(r *retry.R) {
   218  		r.Check(wantRaft(servers))
   219  		for _, s := range servers {
   220  			r.Check(wantPeers(s, 3))
   221  		}
   222  	})
   223  
   224  	// Now kill one of the "old" nodes like we are doing a rolling update.
   225  	s3.Shutdown()
   226  
   227  	isVoter := func() bool {
   228  		future := s1.raft.GetConfiguration()
   229  		if err := future.Error(); err != nil {
   230  			t.Fatalf("err: %v", err)
   231  		}
   232  		for _, s := range future.Configuration().Servers {
   233  			if string(s.ID) == string(s4.config.NodeID) {
   234  				return s.Suffrage == raft.Voter
   235  			}
   236  		}
   237  		t.Fatalf("didn't find s4")
   238  		return false
   239  	}
   240  
   241  	// Wait for s4 to stabilize, get promoted to a voter, and for s3 to be
   242  	// removed.
   243  	servers = []*Server{s1, s2, s4}
   244  	retry.Run(t, func(r *retry.R) {
   245  		r.Check(wantRaft(servers))
   246  		for _, s := range servers {
   247  			r.Check(wantPeers(s, 3))
   248  		}
   249  		if !isVoter() {
   250  			r.Fatalf("should be a voter")
   251  		}
   252  	})
   253  }
   254  
   255  func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
   256  	t.Parallel()
   257  	dir1, s1 := testServerDCBootstrap(t, "dc1", true)
   258  	defer os.RemoveAll(dir1)
   259  	defer s1.Shutdown()
   260  
   261  	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
   262  	defer os.RemoveAll(dir2)
   263  	defer s2.Shutdown()
   264  
   265  	dir3, s3 := testServerDCBootstrap(t, "dc1", false)
   266  	defer os.RemoveAll(dir3)
   267  	defer s3.Shutdown()
   268  
   269  	dir4, s4 := testServerDCBootstrap(t, "dc1", false)
   270  	defer os.RemoveAll(dir4)
   271  	defer s4.Shutdown()
   272  
   273  	servers := []*Server{s1, s2, s3}
   274  
   275  	// Join the servers to s1
   276  	for _, s := range servers[1:] {
   277  		joinLAN(t, s, s1)
   278  	}
   279  
   280  	for _, s := range servers {
   281  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   282  	}
   283  
   284  	testrpc.WaitForLeader(t, s1.RPC, "dc1")
   285  
   286  	// Add s4 to peers directly
   287  	s1.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(joinAddrLAN(s4)), 0, 0)
   288  
   289  	// Verify we have 4 peers
   290  	peers, err := s1.numPeers()
   291  	if err != nil {
   292  		t.Fatal(err)
   293  	}
   294  	if peers != 4 {
   295  		t.Fatalf("bad: %v", peers)
   296  	}
   297  
   298  	// Wait for s4 to be removed
   299  	for _, s := range []*Server{s1, s2, s3} {
   300  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   301  	}
   302  }
   303  
   304  func TestAutopilot_PromoteNonVoter(t *testing.T) {
   305  	t.Parallel()
   306  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
   307  		c.Datacenter = "dc1"
   308  		c.Bootstrap = true
   309  		c.RaftConfig.ProtocolVersion = 3
   310  		c.AutopilotConfig.ServerStabilizationTime = 200 * time.Millisecond
   311  		c.ServerHealthInterval = 100 * time.Millisecond
   312  		c.AutopilotInterval = 100 * time.Millisecond
   313  	})
   314  	defer os.RemoveAll(dir1)
   315  	defer s1.Shutdown()
   316  	codec := rpcClient(t, s1)
   317  	defer codec.Close()
   318  	testrpc.WaitForLeader(t, s1.RPC, "dc1")
   319  
   320  	dir2, s2 := testServerWithConfig(t, func(c *Config) {
   321  		c.Datacenter = "dc1"
   322  		c.Bootstrap = false
   323  		c.RaftConfig.ProtocolVersion = 3
   324  	})
   325  	defer os.RemoveAll(dir2)
   326  	defer s2.Shutdown()
   327  	joinLAN(t, s2, s1)
   328  
   329  	// Make sure we see it as a nonvoter initially. We wait until half
   330  	// the stabilization period has passed.
   331  	retry.Run(t, func(r *retry.R) {
   332  		future := s1.raft.GetConfiguration()
   333  		if err := future.Error(); err != nil {
   334  			r.Fatal(err)
   335  		}
   336  
   337  		servers := future.Configuration().Servers
   338  		if len(servers) != 2 {
   339  			r.Fatalf("bad: %v", servers)
   340  		}
   341  		if servers[1].Suffrage != raft.Nonvoter {
   342  			r.Fatalf("bad: %v", servers)
   343  		}
   344  		health := s1.autopilot.GetServerHealth(string(servers[1].ID))
   345  		if health == nil {
   346  			r.Fatal("nil health")
   347  		}
   348  		if !health.Healthy {
   349  			r.Fatalf("bad: %v", health)
   350  		}
   351  		if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 {
   352  			r.Fatal("stable period not elapsed")
   353  		}
   354  	})
   355  
   356  	// Make sure it ends up as a voter.
   357  	retry.Run(t, func(r *retry.R) {
   358  		future := s1.raft.GetConfiguration()
   359  		if err := future.Error(); err != nil {
   360  			r.Fatal(err)
   361  		}
   362  
   363  		servers := future.Configuration().Servers
   364  		if len(servers) != 2 {
   365  			r.Fatalf("bad: %v", servers)
   366  		}
   367  		if servers[1].Suffrage != raft.Voter {
   368  			r.Fatalf("bad: %v", servers)
   369  		}
   370  	})
   371  }