github.com/adityamillind98/nomad@v0.11.8/nomad/autopilot_test.go (about) 1 package nomad 2 3 import ( 4 "testing" 5 "time" 6 7 "fmt" 8 9 "github.com/hashicorp/consul/agent/consul/autopilot" 10 "github.com/hashicorp/consul/sdk/testutil/retry" 11 "github.com/hashicorp/nomad/testutil" 12 "github.com/hashicorp/raft" 13 "github.com/hashicorp/serf/serf" 14 ) 15 16 // wantPeers determines whether the server has the given 17 // number of voting raft peers. 18 func wantPeers(s *Server, peers int) error { 19 future := s.raft.GetConfiguration() 20 if err := future.Error(); err != nil { 21 return err 22 } 23 24 n := autopilot.NumPeers(future.Configuration()) 25 if got, want := n, peers; got != want { 26 return fmt.Errorf("got %d peers want %d", got, want) 27 } 28 return nil 29 } 30 31 // wantRaft determines if the servers have all of each other in their 32 // Raft configurations, 33 func wantRaft(servers []*Server) error { 34 // Make sure all the servers are represented in the Raft config, 35 // and that there are no extras. 36 verifyRaft := func(c raft.Configuration) error { 37 want := make(map[raft.ServerID]bool) 38 for _, s := range servers { 39 want[s.config.RaftConfig.LocalID] = true 40 } 41 42 found := make([]raft.ServerID, 0, len(c.Servers)) 43 for _, s := range c.Servers { 44 found = append(found, s.ID) 45 if !want[s.ID] { 46 return fmt.Errorf("don't want %q", s.ID) 47 } 48 delete(want, s.ID) 49 } 50 51 if len(want) > 0 { 52 return fmt.Errorf("didn't find %v in %#+v", want, found) 53 } 54 return nil 55 } 56 57 for _, s := range servers { 58 future := s.raft.GetConfiguration() 59 if err := future.Error(); err != nil { 60 return err 61 } 62 if err := verifyRaft(future.Configuration()); err != nil { 63 return err 64 } 65 } 66 return nil 67 } 68 69 func TestAutopilot_CleanupDeadServer(t *testing.T) { 70 t.Parallel() 71 for i := 1; i <= 3; i++ { 72 testCleanupDeadServer(t, i) 73 } 74 } 75 76 func testCleanupDeadServer(t *testing.T, raftVersion int) { 77 conf := func(c *Config) { 78 c.BootstrapExpect = 3 79 c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion) 80 } 81 82 s1, cleanupS1 := TestServer(t, conf) 83 defer cleanupS1() 84 85 s2, cleanupS2 := TestServer(t, conf) 86 defer cleanupS2() 87 88 s3, cleanupS3 := TestServer(t, conf) 89 defer cleanupS3() 90 91 servers := []*Server{s1, s2, s3} 92 93 // Try to join 94 TestJoin(t, s1, s2, s3) 95 96 for _, s := range servers { 97 retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) }) 98 } 99 100 // Bring up a new server 101 s4, cleanupS4 := TestServer(t, conf) 102 defer cleanupS4() 103 104 // Kill a non-leader server 105 s3.Shutdown() 106 retry.Run(t, func(r *retry.R) { 107 alive := 0 108 for _, m := range s1.Members() { 109 if m.Status == serf.StatusAlive { 110 alive++ 111 } 112 } 113 if alive != 2 { 114 r.Fatal(nil) 115 } 116 }) 117 118 // Join the new server 119 TestJoin(t, s1, s4) 120 servers[2] = s4 121 122 // Make sure the dead server is removed and we're back to 3 total peers 123 for _, s := range servers { 124 retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) }) 125 } 126 } 127 128 func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) { 129 t.Parallel() 130 131 conf := func(c *Config) { 132 c.BootstrapExpect = 5 133 } 134 135 s1, cleanupS1 := TestServer(t, conf) 136 defer cleanupS1() 137 138 s2, cleanupS2 := TestServer(t, conf) 139 defer cleanupS2() 140 141 s3, cleanupS3 := TestServer(t, conf) 142 defer cleanupS3() 143 144 s4, cleanupS4 := TestServer(t, conf) 145 defer cleanupS4() 146 147 s5, cleanupS5 := TestServer(t, conf) 148 defer cleanupS5() 149 150 servers := []*Server{s1, s2, s3, s4, s5} 151 152 // Join the servers to s1, and wait until they are all promoted to 153 // voters. 154 TestJoin(t, s1, servers[1:]...) 155 retry.Run(t, func(r *retry.R) { 156 r.Check(wantRaft(servers)) 157 for _, s := range servers { 158 r.Check(wantPeers(s, 5)) 159 } 160 }) 161 162 // Kill a non-leader server 163 s4.Shutdown() 164 165 // Should be removed from the peers automatically 166 servers = []*Server{s1, s2, s3, s5} 167 retry.Run(t, func(r *retry.R) { 168 r.Check(wantRaft(servers)) 169 for _, s := range servers { 170 r.Check(wantPeers(s, 4)) 171 } 172 }) 173 } 174 175 func TestAutopilot_RollingUpdate(t *testing.T) { 176 t.Parallel() 177 178 conf := func(c *Config) { 179 c.BootstrapExpect = 3 180 c.RaftConfig.ProtocolVersion = 3 181 } 182 183 s1, cleanupS1 := TestServer(t, conf) 184 defer cleanupS1() 185 186 s2, cleanupS2 := TestServer(t, conf) 187 defer cleanupS2() 188 189 s3, cleanupS3 := TestServer(t, conf) 190 defer cleanupS3() 191 192 // Join the servers to s1, and wait until they are all promoted to 193 // voters. 194 servers := []*Server{s1, s2, s3} 195 TestJoin(t, s1, s2, s3) 196 retry.Run(t, func(r *retry.R) { 197 r.Check(wantRaft(servers)) 198 for _, s := range servers { 199 r.Check(wantPeers(s, 3)) 200 } 201 }) 202 203 // Add one more server like we are doing a rolling update. 204 t.Logf("adding server s4") 205 s4, cleanupS4 := TestServer(t, conf) 206 defer cleanupS4() 207 TestJoin(t, s1, s4) 208 209 servers = append(servers, s4) 210 retry.Run(t, func(r *retry.R) { 211 r.Check(wantRaft(servers)) 212 for _, s := range servers { 213 r.Check(wantPeers(s, 4)) 214 } 215 }) 216 217 // Now kill one of the "old" nodes like we are doing a rolling update. 218 t.Logf("shutting down server s3") 219 s3.Shutdown() 220 221 isVoter := func() bool { 222 future := s1.raft.GetConfiguration() 223 if err := future.Error(); err != nil { 224 t.Fatalf("err: %v", err) 225 } 226 for _, s := range future.Configuration().Servers { 227 if string(s.ID) == string(s4.config.NodeID) { 228 return s.Suffrage == raft.Voter 229 } 230 } 231 t.Fatalf("didn't find s4") 232 return false 233 } 234 235 t.Logf("waiting for s4 to stabalize and be promoted") 236 237 // Wait for s4 to stabilize, get promoted to a voter, and for s3 to be 238 // removed. 239 servers = []*Server{s1, s2, s4} 240 retry.Run(t, func(r *retry.R) { 241 r.Check(wantRaft(servers)) 242 for _, s := range servers { 243 r.Check(wantPeers(s, 3)) 244 } 245 if !isVoter() { 246 r.Fatalf("should be a voter") 247 } 248 }) 249 } 250 251 func TestAutopilot_CleanupStaleRaftServer(t *testing.T) { 252 t.Skip("TestAutopilot_CleanupDeadServer is very flaky, removing it for now") 253 t.Parallel() 254 255 conf := func(c *Config) { 256 c.BootstrapExpect = 3 257 } 258 s1, cleanupS1 := TestServer(t, conf) 259 defer cleanupS1() 260 261 s2, cleanupS2 := TestServer(t, conf) 262 defer cleanupS2() 263 264 s3, cleanupS3 := TestServer(t, conf) 265 defer cleanupS3() 266 267 s4, cleanupS4 := TestServer(t, func(c *Config) { 268 c.BootstrapExpect = 0 269 }) 270 defer cleanupS4() 271 272 servers := []*Server{s1, s2, s3} 273 274 // Join the servers to s1 275 TestJoin(t, s1, s2, s3) 276 277 leader := waitForStableLeadership(t, servers) 278 279 // Add s4 to peers directly 280 addr := fmt.Sprintf("127.0.0.1:%d", s4.config.RPCAddr.Port) 281 future := leader.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(addr), 0, 0) 282 if err := future.Error(); err != nil { 283 t.Fatal(err) 284 } 285 286 // Verify we have 4 peers 287 peers, err := s1.numPeers() 288 if err != nil { 289 t.Fatal(err) 290 } 291 if peers != 4 { 292 t.Fatalf("bad: %v", peers) 293 } 294 295 // Wait for s4 to be removed 296 for _, s := range []*Server{s1, s2, s3} { 297 retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) }) 298 } 299 } 300 301 func TestAutopilot_PromoteNonVoter(t *testing.T) { 302 t.Parallel() 303 304 s1, cleanupS1 := TestServer(t, func(c *Config) { 305 c.RaftConfig.ProtocolVersion = 3 306 }) 307 defer cleanupS1() 308 codec := rpcClient(t, s1) 309 defer codec.Close() 310 testutil.WaitForLeader(t, s1.RPC) 311 312 s2, cleanupS2 := TestServer(t, func(c *Config) { 313 c.BootstrapExpect = 0 314 c.RaftConfig.ProtocolVersion = 3 315 }) 316 defer cleanupS2() 317 TestJoin(t, s1, s2) 318 319 // Make sure we see it as a nonvoter initially. We wait until half 320 // the stabilization period has passed. 321 retry.Run(t, func(r *retry.R) { 322 future := s1.raft.GetConfiguration() 323 if err := future.Error(); err != nil { 324 r.Fatal(err) 325 } 326 327 servers := future.Configuration().Servers 328 if len(servers) != 2 { 329 r.Fatalf("bad: %v", servers) 330 } 331 if servers[1].Suffrage != raft.Nonvoter { 332 r.Fatalf("bad: %v", servers) 333 } 334 health := s1.autopilot.GetServerHealth(string(servers[1].ID)) 335 if health == nil { 336 r.Fatalf("nil health, %v", s1.autopilot.GetClusterHealth()) 337 } 338 if !health.Healthy { 339 r.Fatalf("bad: %v", health) 340 } 341 if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 { 342 r.Fatal("stable period not elapsed") 343 } 344 }) 345 346 // Make sure it ends up as a voter. 347 retry.Run(t, func(r *retry.R) { 348 future := s1.raft.GetConfiguration() 349 if err := future.Error(); err != nil { 350 r.Fatal(err) 351 } 352 353 servers := future.Configuration().Servers 354 if len(servers) != 2 { 355 r.Fatalf("bad: %v", servers) 356 } 357 if servers[1].Suffrage != raft.Voter { 358 r.Fatalf("bad: %v", servers) 359 } 360 }) 361 }