github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/nomad/autopilot_test.go (about)

     1  package nomad
     2  
     3  import (
     4  	"testing"
     5  	"time"
     6  
     7  	"fmt"
     8  
     9  	"github.com/hashicorp/consul/agent/consul/autopilot"
    10  	"github.com/hashicorp/consul/testutil/retry"
    11  	"github.com/hashicorp/nomad/testutil"
    12  	"github.com/hashicorp/raft"
    13  	"github.com/hashicorp/serf/serf"
    14  )
    15  
    16  // wantPeers determines whether the server has the given
    17  // number of voting raft peers.
    18  func wantPeers(s *Server, peers int) error {
    19  	future := s.raft.GetConfiguration()
    20  	if err := future.Error(); err != nil {
    21  		return err
    22  	}
    23  
    24  	n := autopilot.NumPeers(future.Configuration())
    25  	if got, want := n, peers; got != want {
    26  		return fmt.Errorf("got %d peers want %d", got, want)
    27  	}
    28  	return nil
    29  }
    30  
    31  // wantRaft determines if the servers have all of each other in their
    32  // Raft configurations,
    33  func wantRaft(servers []*Server) error {
    34  	// Make sure all the servers are represented in the Raft config,
    35  	// and that there are no extras.
    36  	verifyRaft := func(c raft.Configuration) error {
    37  		want := make(map[raft.ServerID]bool)
    38  		for _, s := range servers {
    39  			want[s.config.RaftConfig.LocalID] = true
    40  		}
    41  
    42  		for _, s := range c.Servers {
    43  			if !want[s.ID] {
    44  				return fmt.Errorf("don't want %q", s.ID)
    45  			}
    46  			delete(want, s.ID)
    47  		}
    48  
    49  		if len(want) > 0 {
    50  			return fmt.Errorf("didn't find %v", want)
    51  		}
    52  		return nil
    53  	}
    54  
    55  	for _, s := range servers {
    56  		future := s.raft.GetConfiguration()
    57  		if err := future.Error(); err != nil {
    58  			return err
    59  		}
    60  		if err := verifyRaft(future.Configuration()); err != nil {
    61  			return err
    62  		}
    63  	}
    64  	return nil
    65  }
    66  
    67  func TestAutopilot_CleanupDeadServer(t *testing.T) {
    68  	t.Parallel()
    69  	for i := 1; i <= 3; i++ {
    70  		testCleanupDeadServer(t, i)
    71  	}
    72  }
    73  
    74  func testCleanupDeadServer(t *testing.T, raftVersion int) {
    75  	conf := func(c *Config) {
    76  		c.DevDisableBootstrap = true
    77  		c.BootstrapExpect = 3
    78  		c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
    79  	}
    80  	s1 := TestServer(t, conf)
    81  	defer s1.Shutdown()
    82  
    83  	s2 := TestServer(t, conf)
    84  	defer s2.Shutdown()
    85  
    86  	s3 := TestServer(t, conf)
    87  	defer s3.Shutdown()
    88  
    89  	servers := []*Server{s1, s2, s3}
    90  
    91  	// Try to join
    92  	TestJoin(t, s1, s2, s3)
    93  
    94  	for _, s := range servers {
    95  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
    96  	}
    97  
    98  	// Bring up a new server
    99  	s4 := TestServer(t, conf)
   100  	defer s4.Shutdown()
   101  
   102  	// Kill a non-leader server
   103  	s3.Shutdown()
   104  	retry.Run(t, func(r *retry.R) {
   105  		alive := 0
   106  		for _, m := range s1.Members() {
   107  			if m.Status == serf.StatusAlive {
   108  				alive++
   109  			}
   110  		}
   111  		if alive != 2 {
   112  			r.Fatal(nil)
   113  		}
   114  	})
   115  
   116  	// Join the new server
   117  	TestJoin(t, s1, s4)
   118  	servers[2] = s4
   119  
   120  	// Make sure the dead server is removed and we're back to 3 total peers
   121  	for _, s := range servers {
   122  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   123  	}
   124  }
   125  
   126  func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
   127  	t.Parallel()
   128  	s1 := TestServer(t, nil)
   129  	defer s1.Shutdown()
   130  
   131  	conf := func(c *Config) {
   132  		c.DevDisableBootstrap = true
   133  	}
   134  
   135  	s2 := TestServer(t, conf)
   136  	defer s2.Shutdown()
   137  
   138  	s3 := TestServer(t, conf)
   139  	defer s3.Shutdown()
   140  
   141  	s4 := TestServer(t, conf)
   142  	defer s4.Shutdown()
   143  
   144  	s5 := TestServer(t, conf)
   145  	defer s5.Shutdown()
   146  
   147  	servers := []*Server{s1, s2, s3, s4, s5}
   148  
   149  	// Join the servers to s1, and wait until they are all promoted to
   150  	// voters.
   151  	TestJoin(t, s1, servers[1:]...)
   152  	retry.Run(t, func(r *retry.R) {
   153  		r.Check(wantRaft(servers))
   154  		for _, s := range servers {
   155  			r.Check(wantPeers(s, 5))
   156  		}
   157  	})
   158  
   159  	// Kill a non-leader server
   160  	s4.Shutdown()
   161  
   162  	// Should be removed from the peers automatically
   163  	servers = []*Server{s1, s2, s3, s5}
   164  	retry.Run(t, func(r *retry.R) {
   165  		r.Check(wantRaft(servers))
   166  		for _, s := range servers {
   167  			r.Check(wantPeers(s, 4))
   168  		}
   169  	})
   170  }
   171  
   172  func TestAutopilot_RollingUpdate(t *testing.T) {
   173  	t.Parallel()
   174  	s1 := TestServer(t, func(c *Config) {
   175  		c.RaftConfig.ProtocolVersion = 3
   176  	})
   177  	defer s1.Shutdown()
   178  
   179  	conf := func(c *Config) {
   180  		c.DevDisableBootstrap = true
   181  		c.RaftConfig.ProtocolVersion = 3
   182  	}
   183  
   184  	s2 := TestServer(t, conf)
   185  	defer s2.Shutdown()
   186  
   187  	s3 := TestServer(t, conf)
   188  	defer s3.Shutdown()
   189  
   190  	// Join the servers to s1, and wait until they are all promoted to
   191  	// voters.
   192  	servers := []*Server{s1, s2, s3}
   193  	TestJoin(t, s1, s2, s3)
   194  	retry.Run(t, func(r *retry.R) {
   195  		r.Check(wantRaft(servers))
   196  		for _, s := range servers {
   197  			r.Check(wantPeers(s, 3))
   198  		}
   199  	})
   200  
   201  	// Add one more server like we are doing a rolling update.
   202  	s4 := TestServer(t, conf)
   203  	defer s4.Shutdown()
   204  	TestJoin(t, s1, s4)
   205  	servers = append(servers, s4)
   206  	retry.Run(t, func(r *retry.R) {
   207  		r.Check(wantRaft(servers))
   208  		for _, s := range servers {
   209  			r.Check(wantPeers(s, 3))
   210  		}
   211  	})
   212  
   213  	// Now kill one of the "old" nodes like we are doing a rolling update.
   214  	s3.Shutdown()
   215  
   216  	isVoter := func() bool {
   217  		future := s1.raft.GetConfiguration()
   218  		if err := future.Error(); err != nil {
   219  			t.Fatalf("err: %v", err)
   220  		}
   221  		for _, s := range future.Configuration().Servers {
   222  			if string(s.ID) == string(s4.config.NodeID) {
   223  				return s.Suffrage == raft.Voter
   224  			}
   225  		}
   226  		t.Fatalf("didn't find s4")
   227  		return false
   228  	}
   229  
   230  	// Wait for s4 to stabilize, get promoted to a voter, and for s3 to be
   231  	// removed.
   232  	servers = []*Server{s1, s2, s4}
   233  	retry.Run(t, func(r *retry.R) {
   234  		r.Check(wantRaft(servers))
   235  		for _, s := range servers {
   236  			r.Check(wantPeers(s, 3))
   237  		}
   238  		if !isVoter() {
   239  			r.Fatalf("should be a voter")
   240  		}
   241  	})
   242  }
   243  
   244  func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
   245  	t.Parallel()
   246  	s1 := TestServer(t, nil)
   247  	defer s1.Shutdown()
   248  
   249  	conf := func(c *Config) {
   250  		c.DevDisableBootstrap = true
   251  	}
   252  	s2 := TestServer(t, conf)
   253  	defer s2.Shutdown()
   254  
   255  	s3 := TestServer(t, conf)
   256  	defer s3.Shutdown()
   257  
   258  	s4 := TestServer(t, conf)
   259  	defer s4.Shutdown()
   260  
   261  	servers := []*Server{s1, s2, s3}
   262  
   263  	// Join the servers to s1
   264  	TestJoin(t, s1, s2, s3)
   265  
   266  	for _, s := range servers {
   267  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   268  	}
   269  
   270  	testutil.WaitForLeader(t, s1.RPC)
   271  
   272  	// Add s4 to peers directly
   273  	addr := fmt.Sprintf("127.0.0.1:%d", s4.config.RPCAddr.Port)
   274  	future := s1.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(addr), 0, 0)
   275  	if err := future.Error(); err != nil {
   276  		t.Fatal(err)
   277  	}
   278  
   279  	// Verify we have 4 peers
   280  	peers, err := s1.numPeers()
   281  	if err != nil {
   282  		t.Fatal(err)
   283  	}
   284  	if peers != 4 {
   285  		t.Fatalf("bad: %v", peers)
   286  	}
   287  
   288  	// Wait for s4 to be removed
   289  	for _, s := range []*Server{s1, s2, s3} {
   290  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   291  	}
   292  }
   293  
   294  func TestAutopilot_PromoteNonVoter(t *testing.T) {
   295  	t.Parallel()
   296  	s1 := TestServer(t, func(c *Config) {
   297  		c.RaftConfig.ProtocolVersion = 3
   298  	})
   299  	defer s1.Shutdown()
   300  	codec := rpcClient(t, s1)
   301  	defer codec.Close()
   302  	testutil.WaitForLeader(t, s1.RPC)
   303  
   304  	s2 := TestServer(t, func(c *Config) {
   305  		c.DevDisableBootstrap = true
   306  		c.RaftConfig.ProtocolVersion = 3
   307  	})
   308  	defer s2.Shutdown()
   309  	TestJoin(t, s1, s2)
   310  
   311  	// Make sure we see it as a nonvoter initially. We wait until half
   312  	// the stabilization period has passed.
   313  	retry.Run(t, func(r *retry.R) {
   314  		future := s1.raft.GetConfiguration()
   315  		if err := future.Error(); err != nil {
   316  			r.Fatal(err)
   317  		}
   318  
   319  		servers := future.Configuration().Servers
   320  		if len(servers) != 2 {
   321  			r.Fatalf("bad: %v", servers)
   322  		}
   323  		if servers[1].Suffrage != raft.Nonvoter {
   324  			r.Fatalf("bad: %v", servers)
   325  		}
   326  		health := s1.autopilot.GetServerHealth(string(servers[1].ID))
   327  		if health == nil {
   328  			r.Fatalf("nil health, %v", s1.autopilot.GetClusterHealth())
   329  		}
   330  		if !health.Healthy {
   331  			r.Fatalf("bad: %v", health)
   332  		}
   333  		if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 {
   334  			r.Fatal("stable period not elapsed")
   335  		}
   336  	})
   337  
   338  	// Make sure it ends up as a voter.
   339  	retry.Run(t, func(r *retry.R) {
   340  		future := s1.raft.GetConfiguration()
   341  		if err := future.Error(); err != nil {
   342  			r.Fatal(err)
   343  		}
   344  
   345  		servers := future.Configuration().Servers
   346  		if len(servers) != 2 {
   347  			r.Fatalf("bad: %v", servers)
   348  		}
   349  		if servers[1].Suffrage != raft.Voter {
   350  			r.Fatalf("bad: %v", servers)
   351  		}
   352  	})
   353  }