github.com/adityamillind98/nomad@v0.11.8/nomad/autopilot_test.go (about)

     1  package nomad
     2  
     3  import (
     4  	"testing"
     5  	"time"
     6  
     7  	"fmt"
     8  
     9  	"github.com/hashicorp/consul/agent/consul/autopilot"
    10  	"github.com/hashicorp/consul/sdk/testutil/retry"
    11  	"github.com/hashicorp/nomad/testutil"
    12  	"github.com/hashicorp/raft"
    13  	"github.com/hashicorp/serf/serf"
    14  )
    15  
    16  // wantPeers determines whether the server has the given
    17  // number of voting raft peers.
    18  func wantPeers(s *Server, peers int) error {
    19  	future := s.raft.GetConfiguration()
    20  	if err := future.Error(); err != nil {
    21  		return err
    22  	}
    23  
    24  	n := autopilot.NumPeers(future.Configuration())
    25  	if got, want := n, peers; got != want {
    26  		return fmt.Errorf("got %d peers want %d", got, want)
    27  	}
    28  	return nil
    29  }
    30  
    31  // wantRaft determines if the servers have all of each other in their
    32  // Raft configurations,
    33  func wantRaft(servers []*Server) error {
    34  	// Make sure all the servers are represented in the Raft config,
    35  	// and that there are no extras.
    36  	verifyRaft := func(c raft.Configuration) error {
    37  		want := make(map[raft.ServerID]bool)
    38  		for _, s := range servers {
    39  			want[s.config.RaftConfig.LocalID] = true
    40  		}
    41  
    42  		found := make([]raft.ServerID, 0, len(c.Servers))
    43  		for _, s := range c.Servers {
    44  			found = append(found, s.ID)
    45  			if !want[s.ID] {
    46  				return fmt.Errorf("don't want %q", s.ID)
    47  			}
    48  			delete(want, s.ID)
    49  		}
    50  
    51  		if len(want) > 0 {
    52  			return fmt.Errorf("didn't find %v in %#+v", want, found)
    53  		}
    54  		return nil
    55  	}
    56  
    57  	for _, s := range servers {
    58  		future := s.raft.GetConfiguration()
    59  		if err := future.Error(); err != nil {
    60  			return err
    61  		}
    62  		if err := verifyRaft(future.Configuration()); err != nil {
    63  			return err
    64  		}
    65  	}
    66  	return nil
    67  }
    68  
    69  func TestAutopilot_CleanupDeadServer(t *testing.T) {
    70  	t.Parallel()
    71  	for i := 1; i <= 3; i++ {
    72  		testCleanupDeadServer(t, i)
    73  	}
    74  }
    75  
    76  func testCleanupDeadServer(t *testing.T, raftVersion int) {
    77  	conf := func(c *Config) {
    78  		c.BootstrapExpect = 3
    79  		c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
    80  	}
    81  
    82  	s1, cleanupS1 := TestServer(t, conf)
    83  	defer cleanupS1()
    84  
    85  	s2, cleanupS2 := TestServer(t, conf)
    86  	defer cleanupS2()
    87  
    88  	s3, cleanupS3 := TestServer(t, conf)
    89  	defer cleanupS3()
    90  
    91  	servers := []*Server{s1, s2, s3}
    92  
    93  	// Try to join
    94  	TestJoin(t, s1, s2, s3)
    95  
    96  	for _, s := range servers {
    97  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
    98  	}
    99  
   100  	// Bring up a new server
   101  	s4, cleanupS4 := TestServer(t, conf)
   102  	defer cleanupS4()
   103  
   104  	// Kill a non-leader server
   105  	s3.Shutdown()
   106  	retry.Run(t, func(r *retry.R) {
   107  		alive := 0
   108  		for _, m := range s1.Members() {
   109  			if m.Status == serf.StatusAlive {
   110  				alive++
   111  			}
   112  		}
   113  		if alive != 2 {
   114  			r.Fatal(nil)
   115  		}
   116  	})
   117  
   118  	// Join the new server
   119  	TestJoin(t, s1, s4)
   120  	servers[2] = s4
   121  
   122  	// Make sure the dead server is removed and we're back to 3 total peers
   123  	for _, s := range servers {
   124  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   125  	}
   126  }
   127  
   128  func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
   129  	t.Parallel()
   130  
   131  	conf := func(c *Config) {
   132  		c.BootstrapExpect = 5
   133  	}
   134  
   135  	s1, cleanupS1 := TestServer(t, conf)
   136  	defer cleanupS1()
   137  
   138  	s2, cleanupS2 := TestServer(t, conf)
   139  	defer cleanupS2()
   140  
   141  	s3, cleanupS3 := TestServer(t, conf)
   142  	defer cleanupS3()
   143  
   144  	s4, cleanupS4 := TestServer(t, conf)
   145  	defer cleanupS4()
   146  
   147  	s5, cleanupS5 := TestServer(t, conf)
   148  	defer cleanupS5()
   149  
   150  	servers := []*Server{s1, s2, s3, s4, s5}
   151  
   152  	// Join the servers to s1, and wait until they are all promoted to
   153  	// voters.
   154  	TestJoin(t, s1, servers[1:]...)
   155  	retry.Run(t, func(r *retry.R) {
   156  		r.Check(wantRaft(servers))
   157  		for _, s := range servers {
   158  			r.Check(wantPeers(s, 5))
   159  		}
   160  	})
   161  
   162  	// Kill a non-leader server
   163  	s4.Shutdown()
   164  
   165  	// Should be removed from the peers automatically
   166  	servers = []*Server{s1, s2, s3, s5}
   167  	retry.Run(t, func(r *retry.R) {
   168  		r.Check(wantRaft(servers))
   169  		for _, s := range servers {
   170  			r.Check(wantPeers(s, 4))
   171  		}
   172  	})
   173  }
   174  
   175  func TestAutopilot_RollingUpdate(t *testing.T) {
   176  	t.Parallel()
   177  
   178  	conf := func(c *Config) {
   179  		c.BootstrapExpect = 3
   180  		c.RaftConfig.ProtocolVersion = 3
   181  	}
   182  
   183  	s1, cleanupS1 := TestServer(t, conf)
   184  	defer cleanupS1()
   185  
   186  	s2, cleanupS2 := TestServer(t, conf)
   187  	defer cleanupS2()
   188  
   189  	s3, cleanupS3 := TestServer(t, conf)
   190  	defer cleanupS3()
   191  
   192  	// Join the servers to s1, and wait until they are all promoted to
   193  	// voters.
   194  	servers := []*Server{s1, s2, s3}
   195  	TestJoin(t, s1, s2, s3)
   196  	retry.Run(t, func(r *retry.R) {
   197  		r.Check(wantRaft(servers))
   198  		for _, s := range servers {
   199  			r.Check(wantPeers(s, 3))
   200  		}
   201  	})
   202  
   203  	// Add one more server like we are doing a rolling update.
   204  	t.Logf("adding server s4")
   205  	s4, cleanupS4 := TestServer(t, conf)
   206  	defer cleanupS4()
   207  	TestJoin(t, s1, s4)
   208  
   209  	servers = append(servers, s4)
   210  	retry.Run(t, func(r *retry.R) {
   211  		r.Check(wantRaft(servers))
   212  		for _, s := range servers {
   213  			r.Check(wantPeers(s, 4))
   214  		}
   215  	})
   216  
   217  	// Now kill one of the "old" nodes like we are doing a rolling update.
   218  	t.Logf("shutting down server s3")
   219  	s3.Shutdown()
   220  
   221  	isVoter := func() bool {
   222  		future := s1.raft.GetConfiguration()
   223  		if err := future.Error(); err != nil {
   224  			t.Fatalf("err: %v", err)
   225  		}
   226  		for _, s := range future.Configuration().Servers {
   227  			if string(s.ID) == string(s4.config.NodeID) {
   228  				return s.Suffrage == raft.Voter
   229  			}
   230  		}
   231  		t.Fatalf("didn't find s4")
   232  		return false
   233  	}
   234  
   235  	t.Logf("waiting for s4 to stabalize and be promoted")
   236  
   237  	// Wait for s4 to stabilize, get promoted to a voter, and for s3 to be
   238  	// removed.
   239  	servers = []*Server{s1, s2, s4}
   240  	retry.Run(t, func(r *retry.R) {
   241  		r.Check(wantRaft(servers))
   242  		for _, s := range servers {
   243  			r.Check(wantPeers(s, 3))
   244  		}
   245  		if !isVoter() {
   246  			r.Fatalf("should be a voter")
   247  		}
   248  	})
   249  }
   250  
   251  func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
   252  	t.Skip("TestAutopilot_CleanupDeadServer is very flaky, removing it for now")
   253  	t.Parallel()
   254  
   255  	conf := func(c *Config) {
   256  		c.BootstrapExpect = 3
   257  	}
   258  	s1, cleanupS1 := TestServer(t, conf)
   259  	defer cleanupS1()
   260  
   261  	s2, cleanupS2 := TestServer(t, conf)
   262  	defer cleanupS2()
   263  
   264  	s3, cleanupS3 := TestServer(t, conf)
   265  	defer cleanupS3()
   266  
   267  	s4, cleanupS4 := TestServer(t, func(c *Config) {
   268  		c.BootstrapExpect = 0
   269  	})
   270  	defer cleanupS4()
   271  
   272  	servers := []*Server{s1, s2, s3}
   273  
   274  	// Join the servers to s1
   275  	TestJoin(t, s1, s2, s3)
   276  
   277  	leader := waitForStableLeadership(t, servers)
   278  
   279  	// Add s4 to peers directly
   280  	addr := fmt.Sprintf("127.0.0.1:%d", s4.config.RPCAddr.Port)
   281  	future := leader.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(addr), 0, 0)
   282  	if err := future.Error(); err != nil {
   283  		t.Fatal(err)
   284  	}
   285  
   286  	// Verify we have 4 peers
   287  	peers, err := s1.numPeers()
   288  	if err != nil {
   289  		t.Fatal(err)
   290  	}
   291  	if peers != 4 {
   292  		t.Fatalf("bad: %v", peers)
   293  	}
   294  
   295  	// Wait for s4 to be removed
   296  	for _, s := range []*Server{s1, s2, s3} {
   297  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   298  	}
   299  }
   300  
   301  func TestAutopilot_PromoteNonVoter(t *testing.T) {
   302  	t.Parallel()
   303  
   304  	s1, cleanupS1 := TestServer(t, func(c *Config) {
   305  		c.RaftConfig.ProtocolVersion = 3
   306  	})
   307  	defer cleanupS1()
   308  	codec := rpcClient(t, s1)
   309  	defer codec.Close()
   310  	testutil.WaitForLeader(t, s1.RPC)
   311  
   312  	s2, cleanupS2 := TestServer(t, func(c *Config) {
   313  		c.BootstrapExpect = 0
   314  		c.RaftConfig.ProtocolVersion = 3
   315  	})
   316  	defer cleanupS2()
   317  	TestJoin(t, s1, s2)
   318  
   319  	// Make sure we see it as a nonvoter initially. We wait until half
   320  	// the stabilization period has passed.
   321  	retry.Run(t, func(r *retry.R) {
   322  		future := s1.raft.GetConfiguration()
   323  		if err := future.Error(); err != nil {
   324  			r.Fatal(err)
   325  		}
   326  
   327  		servers := future.Configuration().Servers
   328  		if len(servers) != 2 {
   329  			r.Fatalf("bad: %v", servers)
   330  		}
   331  		if servers[1].Suffrage != raft.Nonvoter {
   332  			r.Fatalf("bad: %v", servers)
   333  		}
   334  		health := s1.autopilot.GetServerHealth(string(servers[1].ID))
   335  		if health == nil {
   336  			r.Fatalf("nil health, %v", s1.autopilot.GetClusterHealth())
   337  		}
   338  		if !health.Healthy {
   339  			r.Fatalf("bad: %v", health)
   340  		}
   341  		if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 {
   342  			r.Fatal("stable period not elapsed")
   343  		}
   344  	})
   345  
   346  	// Make sure it ends up as a voter.
   347  	retry.Run(t, func(r *retry.R) {
   348  		future := s1.raft.GetConfiguration()
   349  		if err := future.Error(); err != nil {
   350  			r.Fatal(err)
   351  		}
   352  
   353  		servers := future.Configuration().Servers
   354  		if len(servers) != 2 {
   355  			r.Fatalf("bad: %v", servers)
   356  		}
   357  		if servers[1].Suffrage != raft.Voter {
   358  			r.Fatalf("bad: %v", servers)
   359  		}
   360  	})
   361  }