github.com/manicqin/nomad@v0.9.5/nomad/autopilot_test.go (about) 1 package nomad 2 3 import ( 4 "testing" 5 "time" 6 7 "fmt" 8 9 "github.com/hashicorp/consul/agent/consul/autopilot" 10 "github.com/hashicorp/consul/testutil/retry" 11 "github.com/hashicorp/nomad/testutil" 12 "github.com/hashicorp/raft" 13 "github.com/hashicorp/serf/serf" 14 ) 15 16 // wantPeers determines whether the server has the given 17 // number of voting raft peers. 18 func wantPeers(s *Server, peers int) error { 19 future := s.raft.GetConfiguration() 20 if err := future.Error(); err != nil { 21 return err 22 } 23 24 n := autopilot.NumPeers(future.Configuration()) 25 if got, want := n, peers; got != want { 26 return fmt.Errorf("got %d peers want %d", got, want) 27 } 28 return nil 29 } 30 31 // wantRaft determines if the servers have all of each other in their 32 // Raft configurations, 33 func wantRaft(servers []*Server) error { 34 // Make sure all the servers are represented in the Raft config, 35 // and that there are no extras. 36 verifyRaft := func(c raft.Configuration) error { 37 want := make(map[raft.ServerID]bool) 38 for _, s := range servers { 39 want[s.config.RaftConfig.LocalID] = true 40 } 41 42 for _, s := range c.Servers { 43 if !want[s.ID] { 44 return fmt.Errorf("don't want %q", s.ID) 45 } 46 delete(want, s.ID) 47 } 48 49 if len(want) > 0 { 50 return fmt.Errorf("didn't find %v", want) 51 } 52 return nil 53 } 54 55 for _, s := range servers { 56 future := s.raft.GetConfiguration() 57 if err := future.Error(); err != nil { 58 return err 59 } 60 if err := verifyRaft(future.Configuration()); err != nil { 61 return err 62 } 63 } 64 return nil 65 } 66 67 func TestAutopilot_CleanupDeadServer(t *testing.T) { 68 t.Parallel() 69 for i := 1; i <= 3; i++ { 70 testCleanupDeadServer(t, i) 71 } 72 } 73 74 func testCleanupDeadServer(t *testing.T, raftVersion int) { 75 conf := func(c *Config) { 76 c.DevDisableBootstrap = true 77 c.BootstrapExpect = 3 78 c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion) 79 } 80 81 s1, cleanupS1 := TestServer(t, conf) 82 defer cleanupS1() 83 84 s2, cleanupS2 := TestServer(t, conf) 85 defer cleanupS2() 86 87 s3, cleanupS3 := TestServer(t, conf) 88 defer cleanupS3() 89 90 servers := []*Server{s1, s2, s3} 91 92 // Try to join 93 TestJoin(t, s1, s2, s3) 94 95 for _, s := range servers { 96 retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) }) 97 } 98 99 // Bring up a new server 100 s4, cleanupS4 := TestServer(t, conf) 101 defer cleanupS4() 102 103 // Kill a non-leader server 104 s3.Shutdown() 105 retry.Run(t, func(r *retry.R) { 106 alive := 0 107 for _, m := range s1.Members() { 108 if m.Status == serf.StatusAlive { 109 alive++ 110 } 111 } 112 if alive != 2 { 113 r.Fatal(nil) 114 } 115 }) 116 117 // Join the new server 118 TestJoin(t, s1, s4) 119 servers[2] = s4 120 121 // Make sure the dead server is removed and we're back to 3 total peers 122 for _, s := range servers { 123 retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) }) 124 } 125 } 126 127 func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) { 128 t.Parallel() 129 130 s1, cleanupS1 := TestServer(t, nil) 131 defer cleanupS1() 132 133 conf := func(c *Config) { 134 c.DevDisableBootstrap = true 135 } 136 137 s2, cleanupS2 := TestServer(t, conf) 138 defer cleanupS2() 139 140 s3, cleanupS3 := TestServer(t, conf) 141 defer cleanupS3() 142 143 s4, cleanupS4 := TestServer(t, conf) 144 defer cleanupS4() 145 146 s5, cleanupS5 := TestServer(t, conf) 147 defer cleanupS5() 148 149 servers := []*Server{s1, s2, s3, s4, s5} 150 151 // Join the servers to s1, and wait until they are all promoted to 152 // voters. 153 TestJoin(t, s1, servers[1:]...) 154 retry.Run(t, func(r *retry.R) { 155 r.Check(wantRaft(servers)) 156 for _, s := range servers { 157 r.Check(wantPeers(s, 5)) 158 } 159 }) 160 161 // Kill a non-leader server 162 s4.Shutdown() 163 164 // Should be removed from the peers automatically 165 servers = []*Server{s1, s2, s3, s5} 166 retry.Run(t, func(r *retry.R) { 167 r.Check(wantRaft(servers)) 168 for _, s := range servers { 169 r.Check(wantPeers(s, 4)) 170 } 171 }) 172 } 173 174 func TestAutopilot_RollingUpdate(t *testing.T) { 175 t.Parallel() 176 177 s1, cleanupS1 := TestServer(t, func(c *Config) { 178 c.RaftConfig.ProtocolVersion = 3 179 }) 180 defer cleanupS1() 181 182 conf := func(c *Config) { 183 c.DevDisableBootstrap = true 184 c.RaftConfig.ProtocolVersion = 3 185 } 186 187 s2, cleanupS2 := TestServer(t, conf) 188 defer cleanupS2() 189 190 s3, cleanupS3 := TestServer(t, conf) 191 defer cleanupS3() 192 193 // Join the servers to s1, and wait until they are all promoted to 194 // voters. 195 servers := []*Server{s1, s2, s3} 196 TestJoin(t, s1, s2, s3) 197 retry.Run(t, func(r *retry.R) { 198 r.Check(wantRaft(servers)) 199 for _, s := range servers { 200 r.Check(wantPeers(s, 3)) 201 } 202 }) 203 204 // Add one more server like we are doing a rolling update. 205 s4, cleanupS4 := TestServer(t, conf) 206 defer cleanupS4() 207 TestJoin(t, s1, s4) 208 servers = append(servers, s4) 209 retry.Run(t, func(r *retry.R) { 210 r.Check(wantRaft(servers)) 211 for _, s := range servers { 212 r.Check(wantPeers(s, 3)) 213 } 214 }) 215 216 // Now kill one of the "old" nodes like we are doing a rolling update. 217 s3.Shutdown() 218 219 isVoter := func() bool { 220 future := s1.raft.GetConfiguration() 221 if err := future.Error(); err != nil { 222 t.Fatalf("err: %v", err) 223 } 224 for _, s := range future.Configuration().Servers { 225 if string(s.ID) == string(s4.config.NodeID) { 226 return s.Suffrage == raft.Voter 227 } 228 } 229 t.Fatalf("didn't find s4") 230 return false 231 } 232 233 // Wait for s4 to stabilize, get promoted to a voter, and for s3 to be 234 // removed. 235 servers = []*Server{s1, s2, s4} 236 retry.Run(t, func(r *retry.R) { 237 r.Check(wantRaft(servers)) 238 for _, s := range servers { 239 r.Check(wantPeers(s, 3)) 240 } 241 if !isVoter() { 242 r.Fatalf("should be a voter") 243 } 244 }) 245 } 246 247 func TestAutopilot_CleanupStaleRaftServer(t *testing.T) { 248 t.Skip("TestAutopilot_CleanupDeadServer is very flaky, removing it for now") 249 t.Parallel() 250 251 s1, cleanupS1 := TestServer(t, nil) 252 defer cleanupS1() 253 254 conf := func(c *Config) { 255 c.DevDisableBootstrap = true 256 } 257 s2, cleanupS2 := TestServer(t, conf) 258 defer cleanupS2() 259 260 s3, cleanupS3 := TestServer(t, conf) 261 defer cleanupS3() 262 263 s4, cleanupS4 := TestServer(t, conf) 264 defer cleanupS4() 265 266 servers := []*Server{s1, s2, s3} 267 268 // Join the servers to s1 269 TestJoin(t, s1, s2, s3) 270 271 leader := waitForStableLeadership(t, servers) 272 273 // Add s4 to peers directly 274 addr := fmt.Sprintf("127.0.0.1:%d", s4.config.RPCAddr.Port) 275 future := leader.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(addr), 0, 0) 276 if err := future.Error(); err != nil { 277 t.Fatal(err) 278 } 279 280 // Verify we have 4 peers 281 peers, err := s1.numPeers() 282 if err != nil { 283 t.Fatal(err) 284 } 285 if peers != 4 { 286 t.Fatalf("bad: %v", peers) 287 } 288 289 // Wait for s4 to be removed 290 for _, s := range []*Server{s1, s2, s3} { 291 retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) }) 292 } 293 } 294 295 func TestAutopilot_PromoteNonVoter(t *testing.T) { 296 t.Parallel() 297 298 s1, cleanupS1 := TestServer(t, func(c *Config) { 299 c.RaftConfig.ProtocolVersion = 3 300 }) 301 defer cleanupS1() 302 codec := rpcClient(t, s1) 303 defer codec.Close() 304 testutil.WaitForLeader(t, s1.RPC) 305 306 s2, cleanupS2 := TestServer(t, func(c *Config) { 307 c.DevDisableBootstrap = true 308 c.RaftConfig.ProtocolVersion = 3 309 }) 310 defer cleanupS2() 311 TestJoin(t, s1, s2) 312 313 // Make sure we see it as a nonvoter initially. We wait until half 314 // the stabilization period has passed. 315 retry.Run(t, func(r *retry.R) { 316 future := s1.raft.GetConfiguration() 317 if err := future.Error(); err != nil { 318 r.Fatal(err) 319 } 320 321 servers := future.Configuration().Servers 322 if len(servers) != 2 { 323 r.Fatalf("bad: %v", servers) 324 } 325 if servers[1].Suffrage != raft.Nonvoter { 326 r.Fatalf("bad: %v", servers) 327 } 328 health := s1.autopilot.GetServerHealth(string(servers[1].ID)) 329 if health == nil { 330 r.Fatalf("nil health, %v", s1.autopilot.GetClusterHealth()) 331 } 332 if !health.Healthy { 333 r.Fatalf("bad: %v", health) 334 } 335 if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 { 336 r.Fatal("stable period not elapsed") 337 } 338 }) 339 340 // Make sure it ends up as a voter. 341 retry.Run(t, func(r *retry.R) { 342 future := s1.raft.GetConfiguration() 343 if err := future.Error(); err != nil { 344 r.Fatal(err) 345 } 346 347 servers := future.Configuration().Servers 348 if len(servers) != 2 { 349 r.Fatalf("bad: %v", servers) 350 } 351 if servers[1].Suffrage != raft.Voter { 352 r.Fatalf("bad: %v", servers) 353 } 354 }) 355 }