github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/integration/cluster_test.go (about) 1 // Copyright 2015 The etcd Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package integration 16 17 import ( 18 "context" 19 "fmt" 20 "log" 21 "math/rand" 22 "os" 23 "strconv" 24 "strings" 25 "testing" 26 "time" 27 28 clientv3 "github.com/lfch/etcd-io/client/v3" 29 "github.com/lfch/etcd-io/server/v3/etcdserver" 30 "github.com/lfch/etcd-io/tests/v3/framework/config" 31 "github.com/lfch/etcd-io/tests/v3/framework/integration" 32 ) 33 34 func init() { 35 // open microsecond-level time log for integration test debugging 36 log.SetFlags(log.Ltime | log.Lmicroseconds | log.Lshortfile) 37 if t := os.Getenv("ETCD_ELECTION_TIMEOUT_TICKS"); t != "" { 38 if i, err := strconv.ParseInt(t, 10, 64); err == nil { 39 integration.ElectionTicks = int(i) 40 } 41 } 42 } 43 44 func TestClusterOf1(t *testing.T) { testCluster(t, 1) } 45 func TestClusterOf3(t *testing.T) { testCluster(t, 3) } 46 47 func testCluster(t *testing.T, size int) { 48 integration.BeforeTest(t) 49 c := integration.NewCluster(t, &integration.ClusterConfig{Size: size}) 50 defer c.Terminate(t) 51 clusterMustProgress(t, c.Members) 52 } 53 54 func TestTLSClusterOf3(t *testing.T) { 55 integration.BeforeTest(t) 56 c := integration.NewCluster(t, &integration.ClusterConfig{Size: 3, PeerTLS: &integration.TestTLSInfo}) 57 defer c.Terminate(t) 58 clusterMustProgress(t, c.Members) 59 } 60 61 // Test that a cluster can progress when using separate client and server certs when peering. This supports certificate 62 // authorities that don't issue dual-usage certificates. 63 func TestTLSClusterOf3WithSpecificUsage(t *testing.T) { 64 integration.BeforeTest(t) 65 c := integration.NewCluster(t, &integration.ClusterConfig{Size: 3, PeerTLS: &integration.TestTLSInfoWithSpecificUsage}) 66 defer c.Terminate(t) 67 clusterMustProgress(t, c.Members) 68 } 69 70 func TestDoubleClusterSizeOf1(t *testing.T) { testDoubleClusterSize(t, 1) } 71 func TestDoubleClusterSizeOf3(t *testing.T) { testDoubleClusterSize(t, 3) } 72 73 func testDoubleClusterSize(t *testing.T, size int) { 74 integration.BeforeTest(t) 75 c := integration.NewCluster(t, &integration.ClusterConfig{Size: size, DisableStrictReconfigCheck: true}) 76 defer c.Terminate(t) 77 78 for i := 0; i < size; i++ { 79 c.AddMember(t) 80 } 81 clusterMustProgress(t, c.Members) 82 } 83 84 func TestDoubleTLSClusterSizeOf3(t *testing.T) { 85 integration.BeforeTest(t) 86 cfg := &integration.ClusterConfig{ 87 Size: 1, 88 PeerTLS: &integration.TestTLSInfo, 89 DisableStrictReconfigCheck: true, 90 } 91 c := integration.NewCluster(t, cfg) 92 defer c.Terminate(t) 93 94 for i := 0; i < 3; i++ { 95 c.AddMember(t) 96 } 97 clusterMustProgress(t, c.Members) 98 } 99 100 func TestDecreaseClusterSizeOf3(t *testing.T) { testDecreaseClusterSize(t, 3) } 101 func TestDecreaseClusterSizeOf5(t *testing.T) { testDecreaseClusterSize(t, 5) } 102 103 func testDecreaseClusterSize(t *testing.T, size int) { 104 integration.BeforeTest(t) 105 c := integration.NewCluster(t, &integration.ClusterConfig{Size: size, DisableStrictReconfigCheck: true}) 106 defer c.Terminate(t) 107 108 // TODO: remove the last but one member 109 for i := 0; i < size-1; i++ { 110 id := c.Members[len(c.Members)-1].Server.MemberId() 111 // may hit second leader election on slow machines 112 if err := c.RemoveMember(t, c.Members[0].Client, uint64(id)); err != nil { 113 if strings.Contains(err.Error(), "no leader") { 114 t.Logf("got leader error (%v)", err) 115 i-- 116 continue 117 } 118 t.Fatal(err) 119 } 120 c.WaitMembersForLeader(t, c.Members) 121 } 122 clusterMustProgress(t, c.Members) 123 } 124 125 func TestForceNewCluster(t *testing.T) { 126 integration.BeforeTest(t) 127 c := integration.NewCluster(t, &integration.ClusterConfig{Size: 3, UseBridge: true}) 128 defer c.Terminate(t) 129 130 ctx, cancel := context.WithTimeout(context.Background(), integration.RequestTimeout) 131 resp, err := c.Members[0].Client.Put(ctx, "/foo", "bar") 132 if err != nil { 133 t.Fatalf("unexpected create error: %v", err) 134 } 135 cancel() 136 // ensure create has been applied in this machine 137 ctx, cancel = context.WithTimeout(context.Background(), integration.RequestTimeout) 138 watch := c.Members[0].Client.Watcher.Watch(ctx, "/foo", clientv3.WithRev(resp.Header.Revision-1)) 139 for resp := range watch { 140 if len(resp.Events) != 0 { 141 break 142 } 143 if resp.Err() != nil { 144 t.Fatalf("unexpected watch error: %q", resp.Err()) 145 } 146 if resp.Canceled { 147 t.Fatalf("watch cancelled") 148 } 149 } 150 cancel() 151 152 c.Members[0].Stop(t) 153 c.Members[1].Terminate(t) 154 c.Members[2].Terminate(t) 155 c.Members[0].ForceNewCluster = true 156 err = c.Members[0].Restart(t) 157 if err != nil { 158 t.Fatalf("unexpected ForceRestart error: %v", err) 159 } 160 c.WaitMembersForLeader(t, c.Members[:1]) 161 162 // use new http client to init new connection 163 // ensure force restart keep the old data, and new Cluster can make progress 164 ctx, cancel = context.WithTimeout(context.Background(), integration.RequestTimeout) 165 watch = c.Members[0].Client.Watcher.Watch(ctx, "/foo", clientv3.WithRev(resp.Header.Revision-1)) 166 for resp := range watch { 167 if len(resp.Events) != 0 { 168 break 169 } 170 if resp.Err() != nil { 171 t.Fatalf("unexpected watch error: %q", resp.Err()) 172 } 173 if resp.Canceled { 174 t.Fatalf("watch cancelled") 175 } 176 } 177 cancel() 178 clusterMustProgress(t, c.Members[:1]) 179 } 180 181 func TestAddMemberAfterClusterFullRotation(t *testing.T) { 182 integration.BeforeTest(t) 183 c := integration.NewCluster(t, &integration.ClusterConfig{Size: 3, DisableStrictReconfigCheck: true}) 184 defer c.Terminate(t) 185 186 // remove all the previous three members and add in three new members. 187 for i := 0; i < 3; i++ { 188 if err := c.RemoveMember(t, c.Members[0].Client, uint64(c.Members[1].Server.MemberId())); err != nil { 189 t.Fatal(err) 190 } 191 c.WaitMembersForLeader(t, c.Members) 192 193 c.AddMember(t) 194 c.WaitMembersForLeader(t, c.Members) 195 } 196 197 c.AddMember(t) 198 c.WaitMembersForLeader(t, c.Members) 199 200 clusterMustProgress(t, c.Members) 201 } 202 203 // Ensure we can remove a member then add a new one back immediately. 204 func TestIssue2681(t *testing.T) { 205 integration.BeforeTest(t) 206 c := integration.NewCluster(t, &integration.ClusterConfig{Size: 5, DisableStrictReconfigCheck: true}) 207 defer c.Terminate(t) 208 209 if err := c.RemoveMember(t, c.Members[0].Client, uint64(c.Members[4].Server.MemberId())); err != nil { 210 t.Fatal(err) 211 } 212 c.WaitMembersForLeader(t, c.Members) 213 214 c.AddMember(t) 215 c.WaitMembersForLeader(t, c.Members) 216 clusterMustProgress(t, c.Members) 217 } 218 219 // Ensure we can remove a member after a snapshot then add a new one back. 220 func TestIssue2746(t *testing.T) { testIssue2746(t, 5) } 221 222 // With 3 nodes TestIssue2476 sometimes had a shutdown with an inflight snapshot. 223 func TestIssue2746WithThree(t *testing.T) { testIssue2746(t, 3) } 224 225 func testIssue2746(t *testing.T, members int) { 226 integration.BeforeTest(t) 227 c := integration.NewCluster(t, &integration.ClusterConfig{Size: members, SnapshotCount: 10, DisableStrictReconfigCheck: true}) 228 defer c.Terminate(t) 229 230 // force a snapshot 231 for i := 0; i < 20; i++ { 232 clusterMustProgress(t, c.Members) 233 } 234 235 if err := c.RemoveMember(t, c.Members[0].Client, uint64(c.Members[members-1].Server.MemberId())); err != nil { 236 t.Fatal(err) 237 } 238 c.WaitMembersForLeader(t, c.Members) 239 240 c.AddMember(t) 241 c.WaitMembersForLeader(t, c.Members) 242 clusterMustProgress(t, c.Members) 243 } 244 245 // Ensure etcd will not panic when removing a just started member. 246 func TestIssue2904(t *testing.T) { 247 integration.BeforeTest(t) 248 // start 1-member Cluster to ensure member 0 is the leader of the Cluster. 249 c := integration.NewCluster(t, &integration.ClusterConfig{Size: 2, UseBridge: true, DisableStrictReconfigCheck: true}) 250 defer c.Terminate(t) 251 c.WaitLeader(t) 252 253 c.AddMember(t) 254 c.Members[2].Stop(t) 255 256 // send remove member-1 request to the Cluster. 257 ctx, cancel := context.WithTimeout(context.Background(), integration.RequestTimeout) 258 // the proposal is not committed because member 1 is stopped, but the 259 // proposal is appended to leader'Server raft log. 260 c.Members[0].Client.MemberRemove(ctx, uint64(c.Members[2].Server.MemberId())) 261 cancel() 262 263 // restart member, and expect it to send UpdateAttributes request. 264 // the log in the leader is like this: 265 // [..., remove 1, ..., update attr 1, ...] 266 c.Members[2].Restart(t) 267 // when the member comes back, it ack the proposal to remove itself, 268 // and apply it. 269 <-c.Members[2].Server.StopNotify() 270 271 // terminate removed member 272 c.Members[2].Client.Close() 273 c.Members[2].Terminate(t) 274 c.Members = c.Members[:2] 275 // wait member to be removed. 276 c.WaitMembersMatch(t, c.ProtoMembers()) 277 } 278 279 // TestIssue3699 tests minority failure during cluster configuration; it was 280 // deadlocking. 281 func TestIssue3699(t *testing.T) { 282 // start a Cluster of 3 nodes a, b, c 283 integration.BeforeTest(t) 284 c := integration.NewCluster(t, &integration.ClusterConfig{Size: 3, UseBridge: true, DisableStrictReconfigCheck: true}) 285 defer c.Terminate(t) 286 287 // make node a unavailable 288 c.Members[0].Stop(t) 289 290 // add node d 291 c.AddMember(t) 292 293 t.Logf("Disturbing cluster till member:3 will become a leader") 294 295 // electing node d as leader makes node a unable to participate 296 leaderID := c.WaitMembersForLeader(t, c.Members) 297 for leaderID != 3 { 298 c.Members[leaderID].Stop(t) 299 <-c.Members[leaderID].Server.StopNotify() 300 // do not restart the killed member immediately. 301 // the member will advance its election timeout after restart, 302 // so it will have a better chance to become the leader again. 303 time.Sleep(time.Duration(integration.ElectionTicks * int(config.TickDuration))) 304 c.Members[leaderID].Restart(t) 305 leaderID = c.WaitMembersForLeader(t, c.Members) 306 } 307 308 t.Logf("Finally elected member 3 as the leader.") 309 310 t.Logf("Restarting member '0'...") 311 // bring back node a 312 // node a will remain useless as long as d is the leader. 313 if err := c.Members[0].Restart(t); err != nil { 314 t.Fatal(err) 315 } 316 t.Logf("Restarted member '0'.") 317 318 select { 319 // waiting for ReadyNotify can take several seconds 320 case <-time.After(10 * time.Second): 321 t.Fatalf("waited too long for ready notification") 322 case <-c.Members[0].Server.StopNotify(): 323 t.Fatalf("should not be stopped") 324 case <-c.Members[0].Server.ReadyNotify(): 325 } 326 // must WaitMembersForLeader so goroutines don't leak on terminate 327 c.WaitLeader(t) 328 329 t.Logf("Expecting successful put...") 330 // try to participate in Cluster 331 ctx, cancel := context.WithTimeout(context.Background(), integration.RequestTimeout) 332 if _, err := c.Members[0].Client.Put(ctx, "/foo", "bar"); err != nil { 333 t.Fatalf("unexpected error on Put (%v)", err) 334 } 335 cancel() 336 } 337 338 // TestRejectUnhealthyAdd ensures an unhealthy cluster rejects adding members. 339 func TestRejectUnhealthyAdd(t *testing.T) { 340 integration.BeforeTest(t) 341 c := integration.NewCluster(t, &integration.ClusterConfig{Size: 3, UseBridge: true}) 342 defer c.Terminate(t) 343 344 // make Cluster unhealthy and wait for downed peer 345 c.Members[0].Stop(t) 346 c.WaitLeader(t) 347 348 // all attempts to add member should fail 349 for i := 1; i < len(c.Members); i++ { 350 err := c.AddMemberByURL(t, c.Members[i].Client, "unix://foo:12345") 351 if err == nil { 352 t.Fatalf("should have failed adding peer") 353 } 354 // TODO: client should return descriptive error codes for internal errors 355 if !strings.Contains(err.Error(), "unhealthy cluster") { 356 t.Errorf("unexpected error (%v)", err) 357 } 358 } 359 360 // make cluster healthy 361 c.Members[0].Restart(t) 362 c.WaitLeader(t) 363 time.Sleep(2 * etcdserver.HealthInterval) 364 365 // add member should succeed now that it'Server healthy 366 var err error 367 for i := 1; i < len(c.Members); i++ { 368 if err = c.AddMemberByURL(t, c.Members[i].Client, "unix://foo:12345"); err == nil { 369 break 370 } 371 } 372 if err != nil { 373 t.Fatalf("should have added peer to healthy Cluster (%v)", err) 374 } 375 } 376 377 // TestRejectUnhealthyRemove ensures an unhealthy cluster rejects removing members 378 // if quorum will be lost. 379 func TestRejectUnhealthyRemove(t *testing.T) { 380 integration.BeforeTest(t) 381 c := integration.NewCluster(t, &integration.ClusterConfig{Size: 5, UseBridge: true}) 382 defer c.Terminate(t) 383 384 // make cluster unhealthy and wait for downed peer; (3 up, 2 down) 385 c.Members[0].Stop(t) 386 c.Members[1].Stop(t) 387 leader := c.WaitLeader(t) 388 389 // reject remove active member since (3,2)-(1,0) => (2,2) lacks quorum 390 err := c.RemoveMember(t, c.Members[leader].Client, uint64(c.Members[2].Server.MemberId())) 391 if err == nil { 392 t.Fatalf("should reject quorum breaking remove: %s", err) 393 } 394 // TODO: client should return more descriptive error codes for internal errors 395 if !strings.Contains(err.Error(), "unhealthy cluster") { 396 t.Errorf("unexpected error (%v)", err) 397 } 398 399 // member stopped after launch; wait for missing heartbeats 400 time.Sleep(time.Duration(integration.ElectionTicks * int(config.TickDuration))) 401 402 // permit remove dead member since (3,2) - (0,1) => (3,1) has quorum 403 if err = c.RemoveMember(t, c.Members[2].Client, uint64(c.Members[0].Server.MemberId())); err != nil { 404 t.Fatalf("should accept removing down member: %s", err) 405 } 406 407 // bring cluster to (4,1) 408 c.Members[0].Restart(t) 409 410 // restarted member must be connected for a HealthInterval before remove is accepted 411 time.Sleep((3 * etcdserver.HealthInterval) / 2) 412 413 // accept remove member since (4,1)-(1,0) => (3,1) has quorum 414 if err = c.RemoveMember(t, c.Members[1].Client, uint64(c.Members[0].Server.MemberId())); err != nil { 415 t.Fatalf("expected to remove member, got error %v", err) 416 } 417 } 418 419 // TestRestartRemoved ensures that restarting removed member must exit 420 // if 'initial-cluster-state' is set 'new' and old data directory still exists 421 // (see https://github.com/etcd-io/etcd/issues/7512 for more). 422 func TestRestartRemoved(t *testing.T) { 423 integration.BeforeTest(t) 424 425 // 1. start single-member Cluster 426 c := integration.NewCluster(t, &integration.ClusterConfig{Size: 1}) 427 defer c.Terminate(t) 428 429 // 2. add a new member 430 c.Cfg.DisableStrictReconfigCheck = true 431 c.AddMember(t) 432 c.WaitLeader(t) 433 434 firstMember := c.Members[0] 435 firstMember.KeepDataDirTerminate = true 436 437 // 3. remove first member, shut down without deleting data 438 if err := c.RemoveMember(t, c.Members[1].Client, uint64(firstMember.Server.MemberId())); err != nil { 439 t.Fatalf("expected to remove member, got error %v", err) 440 } 441 c.WaitLeader(t) 442 443 // 4. restart first member with 'initial-cluster-state=new' 444 // wrong config, expects exit within ReqTimeout 445 firstMember.ServerConfig.NewCluster = false 446 if err := firstMember.Restart(t); err != nil { 447 t.Fatalf("unexpected ForceRestart error: %v", err) 448 } 449 defer func() { 450 firstMember.Close() 451 os.RemoveAll(firstMember.ServerConfig.DataDir) 452 }() 453 select { 454 case <-firstMember.Server.StopNotify(): 455 case <-time.After(time.Minute): 456 t.Fatalf("removed member didn't exit within %v", time.Minute) 457 } 458 } 459 460 // clusterMustProgress ensures that cluster can make progress. It creates 461 // a random key first, and check the new key could be got from all client urls 462 // of the cluster. 463 func clusterMustProgress(t *testing.T, members []*integration.Member) { 464 key := fmt.Sprintf("foo%d", rand.Int()) 465 var ( 466 err error 467 resp *clientv3.PutResponse 468 ) 469 // retry in case of leader loss induced by slow CI 470 for i := 0; i < 3; i++ { 471 ctx, cancel := context.WithTimeout(context.Background(), integration.RequestTimeout) 472 resp, err = members[0].Client.Put(ctx, key, "bar") 473 cancel() 474 if err == nil { 475 break 476 } 477 t.Logf("failed to create key on #0 (%v)", err) 478 } 479 if err != nil { 480 t.Fatalf("create on #0 error: %v", err) 481 } 482 483 for i, m := range members { 484 mctx, mcancel := context.WithTimeout(context.Background(), integration.RequestTimeout) 485 watch := m.Client.Watcher.Watch(mctx, key, clientv3.WithRev(resp.Header.Revision-1)) 486 for resp := range watch { 487 if len(resp.Events) != 0 { 488 break 489 } 490 if resp.Err() != nil { 491 t.Fatalf("#%d: watch error: %q", i, resp.Err()) 492 } 493 if resp.Canceled { 494 t.Fatalf("#%d: watch: cancelled", i) 495 } 496 } 497 mcancel() 498 } 499 } 500 501 func TestSpeedyTerminate(t *testing.T) { 502 integration.BeforeTest(t) 503 clus := integration.NewCluster(t, &integration.ClusterConfig{Size: 3, UseBridge: true}) 504 // Stop/Restart so requests will time out on lost leaders 505 for i := 0; i < 3; i++ { 506 clus.Members[i].Stop(t) 507 clus.Members[i].Restart(t) 508 } 509 donec := make(chan struct{}) 510 go func() { 511 defer close(donec) 512 clus.Terminate(t) 513 }() 514 select { 515 case <-time.After(10 * time.Second): 516 t.Fatalf("Cluster took too long to terminate") 517 case <-donec: 518 } 519 }