go.etcd.io/etcd@v3.3.27+incompatible/integration/cluster_test.go

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package integration

import (
	"context"
	"fmt"
	"log"
	"math/rand"
	"os"
	"strconv"
	"strings"
	"testing"
	"time"

	"github.com/coreos/etcd/client"
	"github.com/coreos/etcd/etcdserver"
	"github.com/coreos/etcd/pkg/testutil"

	"github.com/coreos/pkg/capnslog"
)

func init() {
	// open microsecond-level time log for integration test debugging
	log.SetFlags(log.Ltime | log.Lmicroseconds | log.Lshortfile)
	if t := os.Getenv("ETCD_ELECTION_TIMEOUT_TICKS"); t != "" {
		if i, err := strconv.ParseInt(t, 10, 64); err == nil {
			electionTicks = int(i)
		}
	}
}

func TestClusterOf1(t *testing.T) { testCluster(t, 1) }
func TestClusterOf3(t *testing.T) { testCluster(t, 3) }

func testCluster(t *testing.T, size int) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, size)
	c.Launch(t)
	defer c.Terminate(t)
	clusterMustProgress(t, c.Members)
}

func TestTLSClusterOf3(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewClusterByConfig(t, &ClusterConfig{Size: 3, PeerTLS: &testTLSInfo})
	c.Launch(t)
	defer c.Terminate(t)
	clusterMustProgress(t, c.Members)
}

func TestClusterOf1UsingDiscovery(t *testing.T) { testClusterUsingDiscovery(t, 1) }
func TestClusterOf3UsingDiscovery(t *testing.T) { testClusterUsingDiscovery(t, 3) }

func testClusterUsingDiscovery(t *testing.T, size int) {
	defer testutil.AfterTest(t)
	dc := NewCluster(t, 1)
	dc.Launch(t)
	defer dc.Terminate(t)
	// init discovery token space
	dcc := MustNewHTTPClient(t, dc.URLs(), nil)
	dkapi := client.NewKeysAPI(dcc)
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	if _, err := dkapi.Create(ctx, "/_config/size", fmt.Sprintf("%d", size)); err != nil {
		t.Fatal(err)
	}
	cancel()

	c := NewClusterByConfig(
		t,
		&ClusterConfig{Size: size, DiscoveryURL: dc.URL(0) + "/v2/keys"},
	)
	c.Launch(t)
	defer c.Terminate(t)
	clusterMustProgress(t, c.Members)
}

func TestTLSClusterOf3UsingDiscovery(t *testing.T) {
	defer testutil.AfterTest(t)
	dc := NewCluster(t, 1)
	dc.Launch(t)
	defer dc.Terminate(t)
	// init discovery token space
	dcc := MustNewHTTPClient(t, dc.URLs(), nil)
	dkapi := client.NewKeysAPI(dcc)
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	if _, err := dkapi.Create(ctx, "/_config/size", fmt.Sprintf("%d", 3)); err != nil {
		t.Fatal(err)
	}
	cancel()

	c := NewClusterByConfig(t,
		&ClusterConfig{
			Size:         3,
			PeerTLS:      &testTLSInfo,
			DiscoveryURL: dc.URL(0) + "/v2/keys"},
	)
	c.Launch(t)
	defer c.Terminate(t)
	clusterMustProgress(t, c.Members)
}

func TestDoubleClusterSizeOf1(t *testing.T) { testDoubleClusterSize(t, 1) }
func TestDoubleClusterSizeOf3(t *testing.T) { testDoubleClusterSize(t, 3) }

func testDoubleClusterSize(t *testing.T, size int) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, size)
	c.Launch(t)
	defer c.Terminate(t)

	for i := 0; i < size; i++ {
		c.AddMember(t)
	}
	clusterMustProgress(t, c.Members)
}

func TestDoubleTLSClusterSizeOf3(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewClusterByConfig(t, &ClusterConfig{Size: 3, PeerTLS: &testTLSInfo})
	c.Launch(t)
	defer c.Terminate(t)

	for i := 0; i < 3; i++ {
		c.AddMember(t)
	}
	clusterMustProgress(t, c.Members)
}

func TestDecreaseClusterSizeOf3(t *testing.T) { testDecreaseClusterSize(t, 3) }
func TestDecreaseClusterSizeOf5(t *testing.T) { testDecreaseClusterSize(t, 5) }

func testDecreaseClusterSize(t *testing.T, size int) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, size)
	c.Launch(t)
	defer c.Terminate(t)

	// TODO: remove the last but one member
	for i := 0; i < size-1; i++ {
		id := c.Members[len(c.Members)-1].s.ID()
		// may hit second leader election on slow machines
		if err := c.removeMember(t, uint64(id)); err != nil {
			if strings.Contains(err.Error(), "no leader") {
				t.Logf("got leader error (%v)", err)
				i--
				continue
			}
			t.Fatal(err)
		}
		c.waitLeader(t, c.Members)
	}
	clusterMustProgress(t, c.Members)
}

func TestForceNewCluster(t *testing.T) {
	c := NewCluster(t, 3)
	c.Launch(t)
	cc := MustNewHTTPClient(t, []string{c.Members[0].URL()}, nil)
	kapi := client.NewKeysAPI(cc)
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	resp, err := kapi.Create(ctx, "/foo", "bar")
	if err != nil {
		t.Fatalf("unexpected create error: %v", err)
	}
	cancel()
	// ensure create has been applied in this machine
	ctx, cancel = context.WithTimeout(context.Background(), requestTimeout)
	if _, err = kapi.Watcher("/foo", &client.WatcherOptions{AfterIndex: resp.Node.ModifiedIndex - 1}).Next(ctx); err != nil {
		t.Fatalf("unexpected watch error: %v", err)
	}
	cancel()

	c.Members[0].Stop(t)
	c.Members[1].Terminate(t)
	c.Members[2].Terminate(t)
	c.Members[0].ForceNewCluster = true
	err = c.Members[0].Restart(t)
	if err != nil {
		t.Fatalf("unexpected ForceRestart error: %v", err)
	}
	defer c.Members[0].Terminate(t)
	c.waitLeader(t, c.Members[:1])

	// use new http client to init new connection
	cc = MustNewHTTPClient(t, []string{c.Members[0].URL()}, nil)
	kapi = client.NewKeysAPI(cc)
	// ensure force restart keeps the old data, and the new cluster can make progress
	ctx, cancel = context.WithTimeout(context.Background(), requestTimeout)
	if _, err := kapi.Watcher("/foo", &client.WatcherOptions{AfterIndex: resp.Node.ModifiedIndex - 1}).Next(ctx); err != nil {
		t.Fatalf("unexpected watch error: %v", err)
	}
	cancel()
	clusterMustProgress(t, c.Members[:1])
}

func TestAddMemberAfterClusterFullRotation(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, 3)
	c.Launch(t)
	defer c.Terminate(t)

	// remove all the previous three members and add in three new members.
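	// each pass removes the current oldest member and adds a fresh one, waiting
	// for a leader after every reconfiguration so the cluster stays available.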
	for i := 0; i < 3; i++ {
		c.RemoveMember(t, uint64(c.Members[0].s.ID()))
		c.waitLeader(t, c.Members)

		c.AddMember(t)
		c.waitLeader(t, c.Members)
	}

	c.AddMember(t)
	c.waitLeader(t, c.Members)

	clusterMustProgress(t, c.Members)
}

// Ensure we can remove a member then add a new one back immediately.
func TestIssue2681(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, 5)
	c.Launch(t)
	defer c.Terminate(t)

	c.RemoveMember(t, uint64(c.Members[4].s.ID()))
	c.waitLeader(t, c.Members)

	c.AddMember(t)
	c.waitLeader(t, c.Members)
	clusterMustProgress(t, c.Members)
}

// Ensure we can remove a member after a snapshot then add a new one back.
func TestIssue2746(t *testing.T) { testIssue2746(t, 5) }

// With 3 nodes TestIssue2746 sometimes had a shutdown with an inflight snapshot.
func TestIssue2746WithThree(t *testing.T) { testIssue2746(t, 3) }

func testIssue2746(t *testing.T, members int) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, members)

	for _, m := range c.Members {
		m.SnapCount = 10
	}

	c.Launch(t)
	defer c.Terminate(t)

	// force a snapshot
	for i := 0; i < 20; i++ {
		clusterMustProgress(t, c.Members)
	}

	c.RemoveMember(t, uint64(c.Members[members-1].s.ID()))
	c.waitLeader(t, c.Members)

	c.AddMember(t)
	c.waitLeader(t, c.Members)
	clusterMustProgress(t, c.Members)
}

// Ensure etcd will not panic when removing a just started member.
func TestIssue2904(t *testing.T) {
	defer testutil.AfterTest(t)
	// start 1-member cluster to ensure member 0 is the leader of the cluster.
	c := NewCluster(t, 1)
	c.Launch(t)
	defer c.Terminate(t)

	c.AddMember(t)
	c.Members[1].Stop(t)

	// send remove member-1 request to the cluster.
	cc := MustNewHTTPClient(t, c.URLs(), nil)
	ma := client.NewMembersAPI(cc)
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	// the proposal is not committed because member 1 is stopped, but the
	// proposal is appended to the leader's raft log.
	ma.Remove(ctx, c.Members[1].s.ID().String())
	cancel()

	// restart member, and expect it to send UpdateAttributes request.
	// the log in the leader is like this:
	// [..., remove 1, ..., update attr 1, ...]
	c.Members[1].Restart(t)
	// when the member comes back, it acks the proposal to remove itself,
	// and applies it.
	<-c.Members[1].s.StopNotify()

	// terminate removed member
	c.Members[1].Terminate(t)
	c.Members = c.Members[:1]
	// wait for the member to be removed.
	c.waitMembersMatch(t, c.HTTPMembers())
}

// TestIssue3699 tests minority failure during cluster configuration; it was
// deadlocking.
func TestIssue3699(t *testing.T) {
	// start a cluster of 3 nodes a, b, c
	defer testutil.AfterTest(t)
	c := NewCluster(t, 3)
	c.Launch(t)
	defer c.Terminate(t)

	// make node a unavailable
	c.Members[0].Stop(t)

	// add node d
	c.AddMember(t)

	// electing node d as leader makes node a unable to participate
	leaderID := c.waitLeader(t, c.Members)
	for leaderID != 3 {
		c.Members[leaderID].Stop(t)
		<-c.Members[leaderID].s.StopNotify()
		// do not restart the killed member immediately.
		// the member will advance its election timeout after restart,
		// so it will have a better chance to become the leader again.
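		// sleep for roughly one election timeout (electionTicks ticks of
		// tickDuration each) before restarting it.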
		time.Sleep(time.Duration(electionTicks * int(tickDuration)))
		c.Members[leaderID].Restart(t)
		leaderID = c.waitLeader(t, c.Members)
	}

	// bring back node a
	// node a will remain useless as long as d is the leader.
	if err := c.Members[0].Restart(t); err != nil {
		t.Fatal(err)
	}
	select {
	// waiting for ReadyNotify can take several seconds
	case <-time.After(10 * time.Second):
		t.Fatalf("waited too long for ready notification")
	case <-c.Members[0].s.StopNotify():
		t.Fatalf("should not be stopped")
	case <-c.Members[0].s.ReadyNotify():
	}
	// must waitLeader so goroutines don't leak on terminate
	c.waitLeader(t, c.Members)

	// try to participate in cluster
	cc := MustNewHTTPClient(t, []string{c.URL(0)}, c.cfg.ClientTLS)
	kapi := client.NewKeysAPI(cc)
	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	if _, err := kapi.Set(ctx, "/foo", "bar", nil); err != nil {
		t.Fatalf("unexpected error on Set (%v)", err)
	}
	cancel()
}

// TestRejectUnhealthyAdd ensures an unhealthy cluster rejects adding members.
func TestRejectUnhealthyAdd(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, 3)
	for _, m := range c.Members {
		m.ServerConfig.StrictReconfigCheck = true
	}
	c.Launch(t)
	defer c.Terminate(t)

	// make cluster unhealthy and wait for downed peer
	c.Members[0].Stop(t)
	c.WaitLeader(t)

	// all attempts to add member should fail
	for i := 1; i < len(c.Members); i++ {
		err := c.addMemberByURL(t, c.URL(i), "unix://foo:12345")
		if err == nil {
			t.Fatalf("should have failed adding peer")
		}
		// TODO: client should return descriptive error codes for internal errors
		if !strings.Contains(err.Error(), "has no leader") {
			t.Errorf("unexpected error (%v)", err)
		}
	}

	// make cluster healthy
	c.Members[0].Restart(t)
	c.WaitLeader(t)
	time.Sleep(2 * etcdserver.HealthInterval)

	// add member should succeed now that it's healthy
	var err error
	for i := 1; i < len(c.Members); i++ {
		if err = c.addMemberByURL(t, c.URL(i), "unix://foo:12345"); err == nil {
			break
		}
	}
	if err != nil {
		t.Fatalf("should have added peer to healthy cluster (%v)", err)
	}
}

// TestRejectUnhealthyRemove ensures an unhealthy cluster rejects removing members
// if quorum will be lost.
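// With five members quorum is three; once two members are stopped, removing a
// third live member would leave only two reachable voters, so the request must fail.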
func TestRejectUnhealthyRemove(t *testing.T) {
	defer testutil.AfterTest(t)
	c := NewCluster(t, 5)
	for _, m := range c.Members {
		m.ServerConfig.StrictReconfigCheck = true
	}
	c.Launch(t)
	defer c.Terminate(t)

	// make cluster unhealthy and wait for downed peer; (3 up, 2 down)
	c.Members[0].Stop(t)
	c.Members[1].Stop(t)
	c.WaitLeader(t)

	// reject remove active member since (3,2)-(1,0) => (2,2) lacks quorum
	err := c.removeMember(t, uint64(c.Members[2].s.ID()))
	if err == nil {
		t.Fatalf("should reject quorum breaking remove")
	}
	// TODO: client should return more descriptive error codes for internal errors
	if !strings.Contains(err.Error(), "has no leader") {
		t.Errorf("unexpected error (%v)", err)
	}

	// member stopped after launch; wait for missing heartbeats
	time.Sleep(time.Duration(electionTicks * int(tickDuration)))

	// permit remove dead member since (3,2) - (0,1) => (3,1) has quorum
	if err = c.removeMember(t, uint64(c.Members[0].s.ID())); err != nil {
		t.Fatalf("should accept removing down member")
	}

	// bring cluster to (4,1)
	c.Members[0].Restart(t)

	// restarted member must be connected for a HealthInterval before remove is accepted
	time.Sleep((3 * etcdserver.HealthInterval) / 2)

	// accept remove member since (4,1)-(1,0) => (3,1) has quorum
	if err = c.removeMember(t, uint64(c.Members[0].s.ID())); err != nil {
		t.Fatalf("expected to remove member, got error %v", err)
	}
}

// TestRestartRemoved ensures that restarting a removed member must exit
// if 'initial-cluster-state' is set to 'new' and the old data directory still exists
// (see https://github.com/coreos/etcd/issues/7512 for more).
func TestRestartRemoved(t *testing.T) {
	defer testutil.AfterTest(t)

	capnslog.SetGlobalLogLevel(capnslog.INFO)
	defer capnslog.SetGlobalLogLevel(defaultLogLevel)

	// 1. start single-member cluster
	c := NewCluster(t, 1)
	for _, m := range c.Members {
		m.ServerConfig.StrictReconfigCheck = true
	}
	c.Launch(t)
	defer c.Terminate(t)

	// 2. add a new member
	c.AddMember(t)
	c.WaitLeader(t)

	oldm := c.Members[0]
	oldm.keepDataDirTerminate = true

	// 3. remove first member, shut down without deleting data
	if err := c.removeMember(t, uint64(c.Members[0].s.ID())); err != nil {
		t.Fatalf("expected to remove member, got error %v", err)
	}
	c.WaitLeader(t)

	// 4. restart first member with 'initial-cluster-state=new'
	// wrong config, expects exit within ReqTimeout
	oldm.ServerConfig.NewCluster = false
	if err := oldm.Restart(t); err != nil {
		t.Fatalf("unexpected ForceRestart error: %v", err)
	}
	defer func() {
		oldm.Close()
		os.RemoveAll(oldm.ServerConfig.DataDir)
	}()
	select {
	case <-oldm.s.StopNotify():
	case <-time.After(time.Minute):
		t.Fatalf("removed member didn't exit within %v", time.Minute)
	}
}

// clusterMustProgress ensures that the cluster can make progress. It creates
// a random key first, and checks that the new key can be read from all client URLs
// of the cluster.
func clusterMustProgress(t *testing.T, membs []*member) {
	cc := MustNewHTTPClient(t, []string{membs[0].URL()}, nil)
	kapi := client.NewKeysAPI(cc)
	key := fmt.Sprintf("foo%d", rand.Int())
	var (
		err  error
		resp *client.Response
	)
	// retry in case of leader loss induced by slow CI
	for i := 0; i < 3; i++ {
		ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
		resp, err = kapi.Create(ctx, "/"+key, "bar")
		cancel()
		if err == nil {
			break
		}
		t.Logf("failed to create key on %q (%v)", membs[0].URL(), err)
	}
	if err != nil {
		t.Fatalf("create on %s error: %v", membs[0].URL(), err)
	}

	for i, m := range membs {
		u := m.URL()
		mcc := MustNewHTTPClient(t, []string{u}, nil)
		mkapi := client.NewKeysAPI(mcc)
		mctx, mcancel := context.WithTimeout(context.Background(), requestTimeout)
		if _, err := mkapi.Watcher(key, &client.WatcherOptions{AfterIndex: resp.Node.ModifiedIndex - 1}).Next(mctx); err != nil {
			t.Fatalf("#%d: watch on %s error: %v", i, u, err)
		}
		mcancel()
	}
}

func TestSpeedyTerminate(t *testing.T) {
	defer testutil.AfterTest(t)
	clus := NewClusterV3(t, &ClusterConfig{Size: 3})
	// Stop/Restart so requests will time out on lost leaders
	for i := 0; i < 3; i++ {
		clus.Members[i].Stop(t)
		clus.Members[i].Restart(t)
	}
	donec := make(chan struct{})
	go func() {
		defer close(donec)
		clus.Terminate(t)
	}()
	select {
	case <-time.After(10 * time.Second):
		t.Fatalf("cluster took too long to terminate")
	case <-donec:
	}
}