github.com/hernad/nomad@v1.6.112/nomad/autopilot_test.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package nomad
     5  
     6  import (
     7  	"fmt"
     8  	"testing"
     9  	"time"
    10  
    11  	"github.com/hashicorp/raft"
    12  	autopilot "github.com/hashicorp/raft-autopilot"
    13  	"github.com/hashicorp/serf/serf"
    14  	"github.com/shoenig/test/must"
    15  
    16  	"github.com/hernad/nomad/ci"
    17  	"github.com/hernad/nomad/testutil"
    18  )
    19  
// Compile-time check that AutopilotDelegate satisfies the raft-autopilot
// ApplicationIntegration interface.
var _ autopilot.ApplicationIntegration = (*AutopilotDelegate)(nil)
    21  
    22  // wantPeers determines whether the server has the given
    23  // number of voting raft peers.
    24  func wantPeers(s *Server, peers int) error {
    25  	future := s.raft.GetConfiguration()
    26  	if err := future.Error(); err != nil {
    27  		return err
    28  	}
    29  
    30  	var n int
    31  	for _, server := range future.Configuration().Servers {
    32  		if server.Suffrage == raft.Voter {
    33  			n++
    34  		}
    35  	}
    36  
    37  	if got, want := n, peers; got != want {
    38  		return fmt.Errorf("server %v: got %d peers want %d\n\tservers: %#+v", s.config.NodeName, got, want, future.Configuration().Servers)
    39  	}
    40  	return nil
    41  }
    42  
// TestAutopilot_CleanupDeadServer verifies that when one server of a
// 3-node cluster is shut down and a replacement server joins, autopilot
// removes the dead peer and the cluster returns to stable leadership.
func TestAutopilot_CleanupDeadServer(t *testing.T) {
	ci.Parallel(t)

	conf := func(c *Config) {
		c.NumSchedulers = 0 // reduces test log noise
		c.BootstrapExpect = 3
		c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(3)
	}

	s1, cleanupS1 := TestServer(t, conf)
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, conf)
	defer cleanupS2()

	s3, cleanupS3 := TestServer(t, conf)
	defer cleanupS3()

	servers := []*Server{s1, s2, s3}
	TestJoin(t, servers...)

	t.Logf("waiting for initial stable cluster")
	waitForStableLeadership(t, servers)

	// The replacement server; created now but not joined until after the
	// dead server's loss has been observed by the survivors.
	s4, cleanupS4 := TestServer(t, conf)
	defer cleanupS4()

	// Kill a non-leader server
	killedIdx := 0
	for i, s := range servers {
		if !s.IsLeader() {
			killedIdx = i
			t.Logf("killing a server (index %d)", killedIdx)
			s.Shutdown()
			break
		}
	}

	// Wait until every surviving server's serf membership shows exactly
	// 2 alive members, i.e. the shutdown has been detected cluster-wide.
	t.Logf("waiting for server loss to be detected")
	testutil.WaitForResultUntil(10*time.Second, func() (bool, error) {
		for i, s := range servers {
			alive := 0
			if i == killedIdx {
				// Skip shutdown server
				continue
			}
			for _, m := range s.Members() {
				if m.Status == serf.StatusAlive {
					alive++
				}
			}

			if alive != 2 {
				return false, fmt.Errorf("expected 2 alive servers but found %v", alive)
			}
		}
		return true, nil
	}, func(err error) { must.NoError(t, err) })

	// Join the new server
	servers[killedIdx] = s4
	t.Logf("adding server s4")
	TestJoin(t, servers...)

	t.Logf("waiting for dead server to be removed")
	waitForStableLeadership(t, servers)
}
   110  
   111  func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
   112  	ci.Parallel(t)
   113  
   114  	conf := func(c *Config) {
   115  		c.NumSchedulers = 0 // reduces test log noise
   116  		c.BootstrapExpect = 5
   117  	}
   118  
   119  	s1, cleanupS1 := TestServer(t, conf)
   120  	defer cleanupS1()
   121  
   122  	s2, cleanupS2 := TestServer(t, conf)
   123  	defer cleanupS2()
   124  
   125  	s3, cleanupS3 := TestServer(t, conf)
   126  	defer cleanupS3()
   127  
   128  	s4, cleanupS4 := TestServer(t, conf)
   129  	defer cleanupS4()
   130  
   131  	s5, cleanupS5 := TestServer(t, conf)
   132  	defer cleanupS5()
   133  
   134  	servers := []*Server{s1, s2, s3, s4, s5}
   135  	TestJoin(t, servers...)
   136  
   137  	t.Logf("waiting for initial stable cluster")
   138  	waitForStableLeadership(t, servers)
   139  
   140  	t.Logf("killing a non-leader server")
   141  	if leader := waitForStableLeadership(t, servers); leader == s4 {
   142  		s1, s4 = s4, s1
   143  	}
   144  	s4.Shutdown()
   145  
   146  	t.Logf("waiting for dead peer to be removed")
   147  	servers = []*Server{s1, s2, s3, s5}
   148  	waitForStableLeadership(t, servers)
   149  }
   150  
   151  func TestAutopilot_RollingUpdate(t *testing.T) {
   152  	ci.Parallel(t)
   153  
   154  	conf := func(c *Config) {
   155  		c.NumSchedulers = 0 // reduces test log noise
   156  		c.BootstrapExpect = 3
   157  		c.RaftConfig.ProtocolVersion = 3
   158  	}
   159  
   160  	s1, cleanupS1 := TestServer(t, conf)
   161  	defer cleanupS1()
   162  
   163  	s2, cleanupS2 := TestServer(t, conf)
   164  	defer cleanupS2()
   165  
   166  	s3, cleanupS3 := TestServer(t, conf)
   167  	defer cleanupS3()
   168  
   169  	servers := []*Server{s1, s2, s3}
   170  	TestJoin(t, s1, s2, s3)
   171  
   172  	t.Logf("waiting for initial stable cluster")
   173  	waitForStableLeadership(t, servers)
   174  
   175  	// Add one more server like we are doing a rolling update.
   176  	t.Logf("adding server s4")
   177  	s4, cleanupS4 := TestServer(t, conf)
   178  	defer cleanupS4()
   179  	TestJoin(t, s1, s4)
   180  
   181  	// Wait for s4 to stabilize and get promoted to a voter
   182  	t.Logf("waiting for s4 to stabilize and be promoted")
   183  	servers = append(servers, s4)
   184  	waitForStableLeadership(t, servers)
   185  
   186  	// Now kill one of the "old" nodes like we are doing a rolling update.
   187  	t.Logf("shutting down server s3")
   188  	s3.Shutdown()
   189  
   190  	// Wait for s3 to be removed and the cluster to stablize.
   191  	t.Logf("waiting for cluster to stabilize")
   192  	servers = []*Server{s1, s2, s4}
   193  	waitForStableLeadership(t, servers)
   194  }
   195  
   196  func TestAutopilot_MultiRegion(t *testing.T) {
   197  	ci.Parallel(t)
   198  
   199  	conf := func(c *Config) {
   200  		c.NumSchedulers = 0 // reduces test log noise
   201  		c.BootstrapExpect = 3
   202  	}
   203  	s1, cleanupS1 := TestServer(t, conf)
   204  	defer cleanupS1()
   205  
   206  	s2, cleanupS2 := TestServer(t, conf)
   207  	defer cleanupS2()
   208  
   209  	s3, cleanupS3 := TestServer(t, conf)
   210  	defer cleanupS3()
   211  
   212  	// federated regions should not be considered raft peers or show up in the
   213  	// known servers list
   214  	s4, cleanupS4 := TestServer(t, func(c *Config) {
   215  		c.BootstrapExpect = 0
   216  		c.Region = "other"
   217  	})
   218  	defer cleanupS4()
   219  
   220  	servers := []*Server{s1, s2, s3}
   221  	TestJoin(t, s1, s2, s3, s4)
   222  
   223  	t.Logf("waiting for initial stable cluster")
   224  	waitForStableLeadership(t, servers)
   225  
   226  	apDelegate := &AutopilotDelegate{s3}
   227  	known := apDelegate.KnownServers()
   228  	must.Eq(t, 3, len(known))
   229  
   230  }
   231  
   232  func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
   233  	ci.Parallel(t)
   234  
   235  	conf := func(c *Config) {
   236  		c.NumSchedulers = 0 // reduces test log noise
   237  		c.BootstrapExpect = 3
   238  	}
   239  	s1, cleanupS1 := TestServer(t, conf)
   240  	defer cleanupS1()
   241  
   242  	s2, cleanupS2 := TestServer(t, conf)
   243  	defer cleanupS2()
   244  
   245  	s3, cleanupS3 := TestServer(t, conf)
   246  	defer cleanupS3()
   247  
   248  	s4, cleanupS4 := TestServer(t, func(c *Config) {
   249  		c.BootstrapExpect = 0
   250  	})
   251  	defer cleanupS4()
   252  
   253  	servers := []*Server{s1, s2, s3}
   254  	TestJoin(t, s1, s2, s3)
   255  
   256  	t.Logf("waiting for initial stable cluster")
   257  	leader := waitForStableLeadership(t, servers)
   258  
   259  	t.Logf("adding server s4 to peers directly")
   260  	addr := fmt.Sprintf("127.0.0.1:%d", s4.config.RPCAddr.Port)
   261  	future := leader.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(addr), 0, 0)
   262  	if err := future.Error(); err != nil {
   263  		t.Fatal(err)
   264  	}
   265  
   266  	t.Logf("waiting for 4th server to be removed")
   267  	waitForStableLeadership(t, servers)
   268  }
   269  
   270  func TestAutopilot_PromoteNonVoter(t *testing.T) {
   271  	ci.Parallel(t)
   272  
   273  	s1, cleanupS1 := TestServer(t, func(c *Config) {
   274  		c.NumSchedulers = 0 // reduces test log noise
   275  		c.RaftConfig.ProtocolVersion = 3
   276  	})
   277  	defer cleanupS1()
   278  	codec := rpcClient(t, s1)
   279  	defer codec.Close()
   280  	testutil.WaitForLeader(t, s1.RPC)
   281  
   282  	s2, cleanupS2 := TestServer(t, func(c *Config) {
   283  		c.NumSchedulers = 0 // reduces test log noise
   284  		c.BootstrapExpect = 0
   285  		c.RaftConfig.ProtocolVersion = 3
   286  	})
   287  	defer cleanupS2()
   288  	TestJoin(t, s1, s2)
   289  
   290  	// Note: we can't reliably detect that the server is initially a non-voter,
   291  	// because it can transition too quickly for the test setup to detect,
   292  	// especially in low-resource environments like CI. We'll assume that
   293  	// happens correctly here and only test that it transitions to become a
   294  	// voter.
   295  	testutil.WaitForResultUntil(10*time.Second, func() (bool, error) {
   296  		future := s1.raft.GetConfiguration()
   297  		if err := future.Error(); err != nil {
   298  			return false, err
   299  		}
   300  		servers := future.Configuration().Servers
   301  		if len(servers) != 2 {
   302  			return false, fmt.Errorf("expected 2 servers, got: %v", servers)
   303  		}
   304  		if servers[1].Suffrage != raft.Voter {
   305  			return false, fmt.Errorf("expected server to be voter: %v", servers)
   306  		}
   307  		return true, nil
   308  	}, func(err error) { must.NoError(t, err) })
   309  
   310  }