github.com/hernad/nomad@v1.6.112/nomad/autopilot_test.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package nomad 5 6 import ( 7 "fmt" 8 "testing" 9 "time" 10 11 "github.com/hashicorp/raft" 12 autopilot "github.com/hashicorp/raft-autopilot" 13 "github.com/hashicorp/serf/serf" 14 "github.com/shoenig/test/must" 15 16 "github.com/hernad/nomad/ci" 17 "github.com/hernad/nomad/testutil" 18 ) 19 20 var _ autopilot.ApplicationIntegration = (*AutopilotDelegate)(nil) 21 22 // wantPeers determines whether the server has the given 23 // number of voting raft peers. 24 func wantPeers(s *Server, peers int) error { 25 future := s.raft.GetConfiguration() 26 if err := future.Error(); err != nil { 27 return err 28 } 29 30 var n int 31 for _, server := range future.Configuration().Servers { 32 if server.Suffrage == raft.Voter { 33 n++ 34 } 35 } 36 37 if got, want := n, peers; got != want { 38 return fmt.Errorf("server %v: got %d peers want %d\n\tservers: %#+v", s.config.NodeName, got, want, future.Configuration().Servers) 39 } 40 return nil 41 } 42 43 func TestAutopilot_CleanupDeadServer(t *testing.T) { 44 ci.Parallel(t) 45 46 conf := func(c *Config) { 47 c.NumSchedulers = 0 // reduces test log noise 48 c.BootstrapExpect = 3 49 c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(3) 50 } 51 52 s1, cleanupS1 := TestServer(t, conf) 53 defer cleanupS1() 54 55 s2, cleanupS2 := TestServer(t, conf) 56 defer cleanupS2() 57 58 s3, cleanupS3 := TestServer(t, conf) 59 defer cleanupS3() 60 61 servers := []*Server{s1, s2, s3} 62 TestJoin(t, servers...) 63 64 t.Logf("waiting for initial stable cluster") 65 waitForStableLeadership(t, servers) 66 67 s4, cleanupS4 := TestServer(t, conf) 68 defer cleanupS4() 69 70 // Kill a non-leader server 71 killedIdx := 0 72 for i, s := range servers { 73 if !s.IsLeader() { 74 killedIdx = i 75 t.Logf("killing a server (index %d)", killedIdx) 76 s.Shutdown() 77 break 78 } 79 } 80 81 t.Logf("waiting for server loss to be detected") 82 testutil.WaitForResultUntil(10*time.Second, func() (bool, error) { 83 for i, s := range servers { 84 alive := 0 85 if i == killedIdx { 86 // Skip shutdown server 87 continue 88 } 89 for _, m := range s.Members() { 90 if m.Status == serf.StatusAlive { 91 alive++ 92 } 93 } 94 95 if alive != 2 { 96 return false, fmt.Errorf("expected 2 alive servers but found %v", alive) 97 } 98 } 99 return true, nil 100 }, func(err error) { must.NoError(t, err) }) 101 102 // Join the new server 103 servers[killedIdx] = s4 104 t.Logf("adding server s4") 105 TestJoin(t, servers...) 106 107 t.Logf("waiting for dead server to be removed") 108 waitForStableLeadership(t, servers) 109 } 110 111 func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) { 112 ci.Parallel(t) 113 114 conf := func(c *Config) { 115 c.NumSchedulers = 0 // reduces test log noise 116 c.BootstrapExpect = 5 117 } 118 119 s1, cleanupS1 := TestServer(t, conf) 120 defer cleanupS1() 121 122 s2, cleanupS2 := TestServer(t, conf) 123 defer cleanupS2() 124 125 s3, cleanupS3 := TestServer(t, conf) 126 defer cleanupS3() 127 128 s4, cleanupS4 := TestServer(t, conf) 129 defer cleanupS4() 130 131 s5, cleanupS5 := TestServer(t, conf) 132 defer cleanupS5() 133 134 servers := []*Server{s1, s2, s3, s4, s5} 135 TestJoin(t, servers...) 136 137 t.Logf("waiting for initial stable cluster") 138 waitForStableLeadership(t, servers) 139 140 t.Logf("killing a non-leader server") 141 if leader := waitForStableLeadership(t, servers); leader == s4 { 142 s1, s4 = s4, s1 143 } 144 s4.Shutdown() 145 146 t.Logf("waiting for dead peer to be removed") 147 servers = []*Server{s1, s2, s3, s5} 148 waitForStableLeadership(t, servers) 149 } 150 151 func TestAutopilot_RollingUpdate(t *testing.T) { 152 ci.Parallel(t) 153 154 conf := func(c *Config) { 155 c.NumSchedulers = 0 // reduces test log noise 156 c.BootstrapExpect = 3 157 c.RaftConfig.ProtocolVersion = 3 158 } 159 160 s1, cleanupS1 := TestServer(t, conf) 161 defer cleanupS1() 162 163 s2, cleanupS2 := TestServer(t, conf) 164 defer cleanupS2() 165 166 s3, cleanupS3 := TestServer(t, conf) 167 defer cleanupS3() 168 169 servers := []*Server{s1, s2, s3} 170 TestJoin(t, s1, s2, s3) 171 172 t.Logf("waiting for initial stable cluster") 173 waitForStableLeadership(t, servers) 174 175 // Add one more server like we are doing a rolling update. 176 t.Logf("adding server s4") 177 s4, cleanupS4 := TestServer(t, conf) 178 defer cleanupS4() 179 TestJoin(t, s1, s4) 180 181 // Wait for s4 to stabilize and get promoted to a voter 182 t.Logf("waiting for s4 to stabilize and be promoted") 183 servers = append(servers, s4) 184 waitForStableLeadership(t, servers) 185 186 // Now kill one of the "old" nodes like we are doing a rolling update. 187 t.Logf("shutting down server s3") 188 s3.Shutdown() 189 190 // Wait for s3 to be removed and the cluster to stablize. 191 t.Logf("waiting for cluster to stabilize") 192 servers = []*Server{s1, s2, s4} 193 waitForStableLeadership(t, servers) 194 } 195 196 func TestAutopilot_MultiRegion(t *testing.T) { 197 ci.Parallel(t) 198 199 conf := func(c *Config) { 200 c.NumSchedulers = 0 // reduces test log noise 201 c.BootstrapExpect = 3 202 } 203 s1, cleanupS1 := TestServer(t, conf) 204 defer cleanupS1() 205 206 s2, cleanupS2 := TestServer(t, conf) 207 defer cleanupS2() 208 209 s3, cleanupS3 := TestServer(t, conf) 210 defer cleanupS3() 211 212 // federated regions should not be considered raft peers or show up in the 213 // known servers list 214 s4, cleanupS4 := TestServer(t, func(c *Config) { 215 c.BootstrapExpect = 0 216 c.Region = "other" 217 }) 218 defer cleanupS4() 219 220 servers := []*Server{s1, s2, s3} 221 TestJoin(t, s1, s2, s3, s4) 222 223 t.Logf("waiting for initial stable cluster") 224 waitForStableLeadership(t, servers) 225 226 apDelegate := &AutopilotDelegate{s3} 227 known := apDelegate.KnownServers() 228 must.Eq(t, 3, len(known)) 229 230 } 231 232 func TestAutopilot_CleanupStaleRaftServer(t *testing.T) { 233 ci.Parallel(t) 234 235 conf := func(c *Config) { 236 c.NumSchedulers = 0 // reduces test log noise 237 c.BootstrapExpect = 3 238 } 239 s1, cleanupS1 := TestServer(t, conf) 240 defer cleanupS1() 241 242 s2, cleanupS2 := TestServer(t, conf) 243 defer cleanupS2() 244 245 s3, cleanupS3 := TestServer(t, conf) 246 defer cleanupS3() 247 248 s4, cleanupS4 := TestServer(t, func(c *Config) { 249 c.BootstrapExpect = 0 250 }) 251 defer cleanupS4() 252 253 servers := []*Server{s1, s2, s3} 254 TestJoin(t, s1, s2, s3) 255 256 t.Logf("waiting for initial stable cluster") 257 leader := waitForStableLeadership(t, servers) 258 259 t.Logf("adding server s4 to peers directly") 260 addr := fmt.Sprintf("127.0.0.1:%d", s4.config.RPCAddr.Port) 261 future := leader.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(addr), 0, 0) 262 if err := future.Error(); err != nil { 263 t.Fatal(err) 264 } 265 266 t.Logf("waiting for 4th server to be removed") 267 waitForStableLeadership(t, servers) 268 } 269 270 func TestAutopilot_PromoteNonVoter(t *testing.T) { 271 ci.Parallel(t) 272 273 s1, cleanupS1 := TestServer(t, func(c *Config) { 274 c.NumSchedulers = 0 // reduces test log noise 275 c.RaftConfig.ProtocolVersion = 3 276 }) 277 defer cleanupS1() 278 codec := rpcClient(t, s1) 279 defer codec.Close() 280 testutil.WaitForLeader(t, s1.RPC) 281 282 s2, cleanupS2 := TestServer(t, func(c *Config) { 283 c.NumSchedulers = 0 // reduces test log noise 284 c.BootstrapExpect = 0 285 c.RaftConfig.ProtocolVersion = 3 286 }) 287 defer cleanupS2() 288 TestJoin(t, s1, s2) 289 290 // Note: we can't reliably detect that the server is initially a non-voter, 291 // because it can transition too quickly for the test setup to detect, 292 // especially in low-resource environments like CI. We'll assume that 293 // happens correctly here and only test that it transitions to become a 294 // voter. 295 testutil.WaitForResultUntil(10*time.Second, func() (bool, error) { 296 future := s1.raft.GetConfiguration() 297 if err := future.Error(); err != nil { 298 return false, err 299 } 300 servers := future.Configuration().Servers 301 if len(servers) != 2 { 302 return false, fmt.Errorf("expected 2 servers, got: %v", servers) 303 } 304 if servers[1].Suffrage != raft.Voter { 305 return false, fmt.Errorf("expected server to be voter: %v", servers) 306 } 307 return true, nil 308 }, func(err error) { must.NoError(t, err) }) 309 310 }