github.com/manicqin/nomad@v0.9.5/nomad/autopilot_test.go (about)

     1  package nomad
     2  
     3  import (
     4  	"testing"
     5  	"time"
     6  
     7  	"fmt"
     8  
     9  	"github.com/hashicorp/consul/agent/consul/autopilot"
    10  	"github.com/hashicorp/consul/testutil/retry"
    11  	"github.com/hashicorp/nomad/testutil"
    12  	"github.com/hashicorp/raft"
    13  	"github.com/hashicorp/serf/serf"
    14  )
    15  
    16  // wantPeers determines whether the server has the given
    17  // number of voting raft peers.
    18  func wantPeers(s *Server, peers int) error {
    19  	future := s.raft.GetConfiguration()
    20  	if err := future.Error(); err != nil {
    21  		return err
    22  	}
    23  
    24  	n := autopilot.NumPeers(future.Configuration())
    25  	if got, want := n, peers; got != want {
    26  		return fmt.Errorf("got %d peers want %d", got, want)
    27  	}
    28  	return nil
    29  }
    30  
    31  // wantRaft determines if the servers have all of each other in their
    32  // Raft configurations,
    33  func wantRaft(servers []*Server) error {
    34  	// Make sure all the servers are represented in the Raft config,
    35  	// and that there are no extras.
    36  	verifyRaft := func(c raft.Configuration) error {
    37  		want := make(map[raft.ServerID]bool)
    38  		for _, s := range servers {
    39  			want[s.config.RaftConfig.LocalID] = true
    40  		}
    41  
    42  		for _, s := range c.Servers {
    43  			if !want[s.ID] {
    44  				return fmt.Errorf("don't want %q", s.ID)
    45  			}
    46  			delete(want, s.ID)
    47  		}
    48  
    49  		if len(want) > 0 {
    50  			return fmt.Errorf("didn't find %v", want)
    51  		}
    52  		return nil
    53  	}
    54  
    55  	for _, s := range servers {
    56  		future := s.raft.GetConfiguration()
    57  		if err := future.Error(); err != nil {
    58  			return err
    59  		}
    60  		if err := verifyRaft(future.Configuration()); err != nil {
    61  			return err
    62  		}
    63  	}
    64  	return nil
    65  }
    66  
    67  func TestAutopilot_CleanupDeadServer(t *testing.T) {
    68  	t.Parallel()
    69  	for i := 1; i <= 3; i++ {
    70  		testCleanupDeadServer(t, i)
    71  	}
    72  }
    73  
    74  func testCleanupDeadServer(t *testing.T, raftVersion int) {
    75  	conf := func(c *Config) {
    76  		c.DevDisableBootstrap = true
    77  		c.BootstrapExpect = 3
    78  		c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
    79  	}
    80  
    81  	s1, cleanupS1 := TestServer(t, conf)
    82  	defer cleanupS1()
    83  
    84  	s2, cleanupS2 := TestServer(t, conf)
    85  	defer cleanupS2()
    86  
    87  	s3, cleanupS3 := TestServer(t, conf)
    88  	defer cleanupS3()
    89  
    90  	servers := []*Server{s1, s2, s3}
    91  
    92  	// Try to join
    93  	TestJoin(t, s1, s2, s3)
    94  
    95  	for _, s := range servers {
    96  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
    97  	}
    98  
    99  	// Bring up a new server
   100  	s4, cleanupS4 := TestServer(t, conf)
   101  	defer cleanupS4()
   102  
   103  	// Kill a non-leader server
   104  	s3.Shutdown()
   105  	retry.Run(t, func(r *retry.R) {
   106  		alive := 0
   107  		for _, m := range s1.Members() {
   108  			if m.Status == serf.StatusAlive {
   109  				alive++
   110  			}
   111  		}
   112  		if alive != 2 {
   113  			r.Fatal(nil)
   114  		}
   115  	})
   116  
   117  	// Join the new server
   118  	TestJoin(t, s1, s4)
   119  	servers[2] = s4
   120  
   121  	// Make sure the dead server is removed and we're back to 3 total peers
   122  	for _, s := range servers {
   123  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   124  	}
   125  }
   126  
   127  func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
   128  	t.Parallel()
   129  
   130  	s1, cleanupS1 := TestServer(t, nil)
   131  	defer cleanupS1()
   132  
   133  	conf := func(c *Config) {
   134  		c.DevDisableBootstrap = true
   135  	}
   136  
   137  	s2, cleanupS2 := TestServer(t, conf)
   138  	defer cleanupS2()
   139  
   140  	s3, cleanupS3 := TestServer(t, conf)
   141  	defer cleanupS3()
   142  
   143  	s4, cleanupS4 := TestServer(t, conf)
   144  	defer cleanupS4()
   145  
   146  	s5, cleanupS5 := TestServer(t, conf)
   147  	defer cleanupS5()
   148  
   149  	servers := []*Server{s1, s2, s3, s4, s5}
   150  
   151  	// Join the servers to s1, and wait until they are all promoted to
   152  	// voters.
   153  	TestJoin(t, s1, servers[1:]...)
   154  	retry.Run(t, func(r *retry.R) {
   155  		r.Check(wantRaft(servers))
   156  		for _, s := range servers {
   157  			r.Check(wantPeers(s, 5))
   158  		}
   159  	})
   160  
   161  	// Kill a non-leader server
   162  	s4.Shutdown()
   163  
   164  	// Should be removed from the peers automatically
   165  	servers = []*Server{s1, s2, s3, s5}
   166  	retry.Run(t, func(r *retry.R) {
   167  		r.Check(wantRaft(servers))
   168  		for _, s := range servers {
   169  			r.Check(wantPeers(s, 4))
   170  		}
   171  	})
   172  }
   173  
   174  func TestAutopilot_RollingUpdate(t *testing.T) {
   175  	t.Parallel()
   176  
   177  	s1, cleanupS1 := TestServer(t, func(c *Config) {
   178  		c.RaftConfig.ProtocolVersion = 3
   179  	})
   180  	defer cleanupS1()
   181  
   182  	conf := func(c *Config) {
   183  		c.DevDisableBootstrap = true
   184  		c.RaftConfig.ProtocolVersion = 3
   185  	}
   186  
   187  	s2, cleanupS2 := TestServer(t, conf)
   188  	defer cleanupS2()
   189  
   190  	s3, cleanupS3 := TestServer(t, conf)
   191  	defer cleanupS3()
   192  
   193  	// Join the servers to s1, and wait until they are all promoted to
   194  	// voters.
   195  	servers := []*Server{s1, s2, s3}
   196  	TestJoin(t, s1, s2, s3)
   197  	retry.Run(t, func(r *retry.R) {
   198  		r.Check(wantRaft(servers))
   199  		for _, s := range servers {
   200  			r.Check(wantPeers(s, 3))
   201  		}
   202  	})
   203  
   204  	// Add one more server like we are doing a rolling update.
   205  	s4, cleanupS4 := TestServer(t, conf)
   206  	defer cleanupS4()
   207  	TestJoin(t, s1, s4)
   208  	servers = append(servers, s4)
   209  	retry.Run(t, func(r *retry.R) {
   210  		r.Check(wantRaft(servers))
   211  		for _, s := range servers {
   212  			r.Check(wantPeers(s, 3))
   213  		}
   214  	})
   215  
   216  	// Now kill one of the "old" nodes like we are doing a rolling update.
   217  	s3.Shutdown()
   218  
   219  	isVoter := func() bool {
   220  		future := s1.raft.GetConfiguration()
   221  		if err := future.Error(); err != nil {
   222  			t.Fatalf("err: %v", err)
   223  		}
   224  		for _, s := range future.Configuration().Servers {
   225  			if string(s.ID) == string(s4.config.NodeID) {
   226  				return s.Suffrage == raft.Voter
   227  			}
   228  		}
   229  		t.Fatalf("didn't find s4")
   230  		return false
   231  	}
   232  
   233  	// Wait for s4 to stabilize, get promoted to a voter, and for s3 to be
   234  	// removed.
   235  	servers = []*Server{s1, s2, s4}
   236  	retry.Run(t, func(r *retry.R) {
   237  		r.Check(wantRaft(servers))
   238  		for _, s := range servers {
   239  			r.Check(wantPeers(s, 3))
   240  		}
   241  		if !isVoter() {
   242  			r.Fatalf("should be a voter")
   243  		}
   244  	})
   245  }
   246  
   247  func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
   248  	t.Skip("TestAutopilot_CleanupDeadServer is very flaky, removing it for now")
   249  	t.Parallel()
   250  
   251  	s1, cleanupS1 := TestServer(t, nil)
   252  	defer cleanupS1()
   253  
   254  	conf := func(c *Config) {
   255  		c.DevDisableBootstrap = true
   256  	}
   257  	s2, cleanupS2 := TestServer(t, conf)
   258  	defer cleanupS2()
   259  
   260  	s3, cleanupS3 := TestServer(t, conf)
   261  	defer cleanupS3()
   262  
   263  	s4, cleanupS4 := TestServer(t, conf)
   264  	defer cleanupS4()
   265  
   266  	servers := []*Server{s1, s2, s3}
   267  
   268  	// Join the servers to s1
   269  	TestJoin(t, s1, s2, s3)
   270  
   271  	leader := waitForStableLeadership(t, servers)
   272  
   273  	// Add s4 to peers directly
   274  	addr := fmt.Sprintf("127.0.0.1:%d", s4.config.RPCAddr.Port)
   275  	future := leader.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(addr), 0, 0)
   276  	if err := future.Error(); err != nil {
   277  		t.Fatal(err)
   278  	}
   279  
   280  	// Verify we have 4 peers
   281  	peers, err := s1.numPeers()
   282  	if err != nil {
   283  		t.Fatal(err)
   284  	}
   285  	if peers != 4 {
   286  		t.Fatalf("bad: %v", peers)
   287  	}
   288  
   289  	// Wait for s4 to be removed
   290  	for _, s := range []*Server{s1, s2, s3} {
   291  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   292  	}
   293  }
   294  
   295  func TestAutopilot_PromoteNonVoter(t *testing.T) {
   296  	t.Parallel()
   297  
   298  	s1, cleanupS1 := TestServer(t, func(c *Config) {
   299  		c.RaftConfig.ProtocolVersion = 3
   300  	})
   301  	defer cleanupS1()
   302  	codec := rpcClient(t, s1)
   303  	defer codec.Close()
   304  	testutil.WaitForLeader(t, s1.RPC)
   305  
   306  	s2, cleanupS2 := TestServer(t, func(c *Config) {
   307  		c.DevDisableBootstrap = true
   308  		c.RaftConfig.ProtocolVersion = 3
   309  	})
   310  	defer cleanupS2()
   311  	TestJoin(t, s1, s2)
   312  
   313  	// Make sure we see it as a nonvoter initially. We wait until half
   314  	// the stabilization period has passed.
   315  	retry.Run(t, func(r *retry.R) {
   316  		future := s1.raft.GetConfiguration()
   317  		if err := future.Error(); err != nil {
   318  			r.Fatal(err)
   319  		}
   320  
   321  		servers := future.Configuration().Servers
   322  		if len(servers) != 2 {
   323  			r.Fatalf("bad: %v", servers)
   324  		}
   325  		if servers[1].Suffrage != raft.Nonvoter {
   326  			r.Fatalf("bad: %v", servers)
   327  		}
   328  		health := s1.autopilot.GetServerHealth(string(servers[1].ID))
   329  		if health == nil {
   330  			r.Fatalf("nil health, %v", s1.autopilot.GetClusterHealth())
   331  		}
   332  		if !health.Healthy {
   333  			r.Fatalf("bad: %v", health)
   334  		}
   335  		if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 {
   336  			r.Fatal("stable period not elapsed")
   337  		}
   338  	})
   339  
   340  	// Make sure it ends up as a voter.
   341  	retry.Run(t, func(r *retry.R) {
   342  		future := s1.raft.GetConfiguration()
   343  		if err := future.Error(); err != nil {
   344  			r.Fatal(err)
   345  		}
   346  
   347  		servers := future.Configuration().Servers
   348  		if len(servers) != 2 {
   349  			r.Fatalf("bad: %v", servers)
   350  		}
   351  		if servers[1].Suffrage != raft.Voter {
   352  			r.Fatalf("bad: %v", servers)
   353  		}
   354  	})
   355  }