// Source: go.etcd.io/etcd@v3.3.27+incompatible/functional/tester/cluster.go

// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tester

import (
	"context"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"math/rand"
	"net/http"
	"net/url"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/coreos/etcd/functional/rpcpb"
	"github.com/coreos/etcd/pkg/debugutil"
	"github.com/coreos/etcd/pkg/fileutil"

	"github.com/prometheus/client_golang/prometheus/promhttp"
	"go.uber.org/zap"
	"golang.org/x/time/rate"
	"google.golang.org/grpc"
)

// Cluster defines tester cluster.
//
// The agent* slices are parallel to Members: index i of each slice belongs
// to Members[i].
type Cluster struct {
	// lg is the logger used by every method of the cluster.
	lg *zap.Logger

	// agentConns holds one gRPC connection per member's agent.
	agentConns []*grpc.ClientConn
	// agentClients holds the transport client created on top of each
	// agent connection.
	agentClients []rpcpb.TransportClient
	// agentStreams holds the stream over which operations are sent to,
	// and responses received from, each agent.
	agentStreams []rpcpb.Transport_TransportClient
	// agentRequests holds the most recent request sent to each agent.
	agentRequests []*rpcpb.Request

	// testerHTTPServer serves "/metrics" and, when enabled in the tester
	// configuration, the pprof handlers.
	testerHTTPServer *http.Server

	// Members is read from the "agent-configs" YAML key.
	Members []*rpcpb.Member `yaml:"agent-configs"`
	// Tester is read from the "tester-config" YAML key.
	Tester *rpcpb.Tester `yaml:"tester-config"`

	// cases is the list of test cases built from Tester.Cases.
	cases []Case

	// rateLimiter is created from Tester.StressQPS.
	rateLimiter *rate.Limiter
	// stresser is the composite of all per-member stressers.
	stresser Stresser
	// checkers is the list of checkers built from Tester.Checkers.
	checkers []Checker

	// currentRevision is not written in this part of the file; presumably
	// it is maintained by updateRevision — TODO confirm.
	currentRevision int64
	// rd and cs are logged as "round" and "case" respectively.
	rd int
	cs int
}

// dialOpts are the options used to dial every agent: no transport security,
// a 5-second dial timeout, and a blocking dial.
var dialOpts = []grpc.DialOption{
	grpc.WithInsecure(),
	grpc.WithTimeout(5 * time.Second),
	grpc.WithBlock(),
}

// NewCluster creates a client from a tester configuration.
73 func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) { 74 clus, err := read(lg, fpath) 75 if err != nil { 76 return nil, err 77 } 78 79 clus.agentConns = make([]*grpc.ClientConn, len(clus.Members)) 80 clus.agentClients = make([]rpcpb.TransportClient, len(clus.Members)) 81 clus.agentStreams = make([]rpcpb.Transport_TransportClient, len(clus.Members)) 82 clus.agentRequests = make([]*rpcpb.Request, len(clus.Members)) 83 clus.cases = make([]Case, 0) 84 85 for i, ap := range clus.Members { 86 var err error 87 clus.agentConns[i], err = grpc.Dial(ap.AgentAddr, dialOpts...) 88 if err != nil { 89 return nil, err 90 } 91 clus.agentClients[i] = rpcpb.NewTransportClient(clus.agentConns[i]) 92 clus.lg.Info("connected", zap.String("agent-address", ap.AgentAddr)) 93 94 clus.agentStreams[i], err = clus.agentClients[i].Transport(context.Background()) 95 if err != nil { 96 return nil, err 97 } 98 clus.lg.Info("created stream", zap.String("agent-address", ap.AgentAddr)) 99 } 100 101 mux := http.NewServeMux() 102 mux.Handle("/metrics", promhttp.Handler()) 103 if clus.Tester.EnablePprof { 104 for p, h := range debugutil.PProfHandlers() { 105 mux.Handle(p, h) 106 } 107 } 108 clus.testerHTTPServer = &http.Server{ 109 Addr: clus.Tester.Addr, 110 Handler: mux, 111 } 112 go clus.serveTesterServer() 113 114 clus.updateCases() 115 116 clus.rateLimiter = rate.NewLimiter( 117 rate.Limit(int(clus.Tester.StressQPS)), 118 int(clus.Tester.StressQPS), 119 ) 120 121 clus.setStresserChecker() 122 123 return clus, nil 124 } 125 126 // EtcdClientEndpoints returns all etcd client endpoints. 
127 func (clus *Cluster) EtcdClientEndpoints() (css []string) { 128 css = make([]string, len(clus.Members)) 129 for i := range clus.Members { 130 css[i] = clus.Members[i].EtcdClientEndpoint 131 } 132 return css 133 } 134 135 func (clus *Cluster) serveTesterServer() { 136 clus.lg.Info( 137 "started tester HTTP server", 138 zap.String("tester-address", clus.Tester.Addr), 139 ) 140 err := clus.testerHTTPServer.ListenAndServe() 141 clus.lg.Info( 142 "tester HTTP server returned", 143 zap.String("tester-address", clus.Tester.Addr), 144 zap.Error(err), 145 ) 146 if err != nil && err != http.ErrServerClosed { 147 clus.lg.Fatal("tester HTTP errored", zap.Error(err)) 148 } 149 } 150 151 func (clus *Cluster) updateCases() { 152 for _, cs := range clus.Tester.Cases { 153 switch cs { 154 case "SIGTERM_ONE_FOLLOWER": 155 clus.cases = append(clus.cases, 156 new_Case_SIGTERM_ONE_FOLLOWER(clus)) 157 case "SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": 158 clus.cases = append(clus.cases, 159 new_Case_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus)) 160 case "SIGTERM_LEADER": 161 clus.cases = append(clus.cases, 162 new_Case_SIGTERM_LEADER(clus)) 163 case "SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT": 164 clus.cases = append(clus.cases, 165 new_Case_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus)) 166 case "SIGTERM_QUORUM": 167 clus.cases = append(clus.cases, 168 new_Case_SIGTERM_QUORUM(clus)) 169 case "SIGTERM_ALL": 170 clus.cases = append(clus.cases, 171 new_Case_SIGTERM_ALL(clus)) 172 173 case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER": 174 clus.cases = append(clus.cases, 175 new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus)) 176 case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": 177 clus.cases = append(clus.cases, 178 new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus)) 179 case "SIGQUIT_AND_REMOVE_LEADER": 180 clus.cases = append(clus.cases, 181 new_Case_SIGQUIT_AND_REMOVE_LEADER(clus)) 182 case "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT": 183 clus.cases = 
append(clus.cases, 184 new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus)) 185 case "SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH": 186 clus.cases = append(clus.cases, 187 new_Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH(clus)) 188 189 case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER": 190 clus.cases = append(clus.cases, 191 new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus)) 192 case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": 193 clus.cases = append(clus.cases, 194 new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT()) 195 case "BLACKHOLE_PEER_PORT_TX_RX_LEADER": 196 clus.cases = append(clus.cases, 197 new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER(clus)) 198 case "BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT": 199 clus.cases = append(clus.cases, 200 new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT()) 201 case "BLACKHOLE_PEER_PORT_TX_RX_QUORUM": 202 clus.cases = append(clus.cases, 203 new_Case_BLACKHOLE_PEER_PORT_TX_RX_QUORUM(clus)) 204 case "BLACKHOLE_PEER_PORT_TX_RX_ALL": 205 clus.cases = append(clus.cases, 206 new_Case_BLACKHOLE_PEER_PORT_TX_RX_ALL(clus)) 207 208 case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER": 209 clus.cases = append(clus.cases, 210 new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, false)) 211 case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER": 212 clus.cases = append(clus.cases, 213 new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, true)) 214 case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": 215 clus.cases = append(clus.cases, 216 new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, false)) 217 case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": 218 clus.cases = append(clus.cases, 219 new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, true)) 220 case "DELAY_PEER_PORT_TX_RX_LEADER": 221 clus.cases = append(clus.cases, 222 
new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus, false)) 223 case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER": 224 clus.cases = append(clus.cases, 225 new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus, true)) 226 case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT": 227 clus.cases = append(clus.cases, 228 new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, false)) 229 case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT": 230 clus.cases = append(clus.cases, 231 new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, true)) 232 case "DELAY_PEER_PORT_TX_RX_QUORUM": 233 clus.cases = append(clus.cases, 234 new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus, false)) 235 case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM": 236 clus.cases = append(clus.cases, 237 new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus, true)) 238 case "DELAY_PEER_PORT_TX_RX_ALL": 239 clus.cases = append(clus.cases, 240 new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus, false)) 241 case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL": 242 clus.cases = append(clus.cases, 243 new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus, true)) 244 245 case "NO_FAIL_WITH_STRESS": 246 clus.cases = append(clus.cases, 247 new_Case_NO_FAIL_WITH_STRESS(clus)) 248 case "NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS": 249 clus.cases = append(clus.cases, 250 new_Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS(clus)) 251 252 case "EXTERNAL": 253 clus.cases = append(clus.cases, 254 new_Case_EXTERNAL(clus.Tester.ExternalExecPath)) 255 case "FAILPOINTS": 256 fpFailures, fperr := failpointFailures(clus) 257 if len(fpFailures) == 0 { 258 clus.lg.Info("no failpoints found!", zap.Error(fperr)) 259 } 260 clus.cases = append(clus.cases, 261 fpFailures...) 262 } 263 } 264 } 265 266 func (clus *Cluster) listCases() (css []string) { 267 css = make([]string, len(clus.cases)) 268 for i := range clus.cases { 269 css[i] = clus.cases[i].Desc() 270 } 271 return css 272 } 273 274 // UpdateDelayLatencyMs updates delay latency with random value 275 // within election timeout. 
276 func (clus *Cluster) UpdateDelayLatencyMs() { 277 rand.Seed(time.Now().UnixNano()) 278 clus.Tester.UpdatedDelayLatencyMs = uint32(rand.Int63n(clus.Members[0].Etcd.ElectionTimeoutMs)) 279 280 minLatRv := clus.Tester.DelayLatencyMsRv + clus.Tester.DelayLatencyMsRv/5 281 if clus.Tester.UpdatedDelayLatencyMs <= minLatRv { 282 clus.Tester.UpdatedDelayLatencyMs += minLatRv 283 } 284 } 285 286 func (clus *Cluster) setStresserChecker() { 287 css := &compositeStresser{} 288 lss := []*leaseStresser{} 289 rss := []*runnerStresser{} 290 for _, m := range clus.Members { 291 sss := newStresser(clus, m) 292 css.stressers = append(css.stressers, &compositeStresser{sss}) 293 for _, s := range sss { 294 if v, ok := s.(*leaseStresser); ok { 295 lss = append(lss, v) 296 clus.lg.Info("added lease stresser", zap.String("endpoint", m.EtcdClientEndpoint)) 297 } 298 if v, ok := s.(*runnerStresser); ok { 299 rss = append(rss, v) 300 clus.lg.Info("added lease stresser", zap.String("endpoint", m.EtcdClientEndpoint)) 301 } 302 } 303 } 304 clus.stresser = css 305 306 for _, cs := range clus.Tester.Checkers { 307 switch cs { 308 case "KV_HASH": 309 clus.checkers = append(clus.checkers, newKVHashChecker(clus)) 310 311 case "LEASE_EXPIRE": 312 for _, ls := range lss { 313 clus.checkers = append(clus.checkers, newLeaseExpireChecker(ls)) 314 } 315 316 case "RUNNER": 317 for _, rs := range rss { 318 clus.checkers = append(clus.checkers, newRunnerChecker(rs.etcdClientEndpoint, rs.errc)) 319 } 320 321 case "NO_CHECK": 322 clus.checkers = append(clus.checkers, newNoChecker()) 323 } 324 } 325 clus.lg.Info("updated stressers") 326 } 327 328 func (clus *Cluster) runCheckers(exceptions ...rpcpb.Checker) (err error) { 329 defer func() { 330 if err != nil { 331 return 332 } 333 if err = clus.updateRevision(); err != nil { 334 clus.lg.Warn( 335 "updateRevision failed", 336 zap.Error(err), 337 ) 338 return 339 } 340 }() 341 342 exs := make(map[rpcpb.Checker]struct{}) 343 for _, e := range exceptions { 344 
exs[e] = struct{}{} 345 } 346 for _, chk := range clus.checkers { 347 clus.lg.Warn( 348 "consistency check START", 349 zap.String("checker", chk.Type().String()), 350 zap.Strings("client-endpoints", chk.EtcdClientEndpoints()), 351 ) 352 err = chk.Check() 353 clus.lg.Warn( 354 "consistency check END", 355 zap.String("checker", chk.Type().String()), 356 zap.Strings("client-endpoints", chk.EtcdClientEndpoints()), 357 zap.Error(err), 358 ) 359 if err != nil { 360 _, ok := exs[chk.Type()] 361 if !ok { 362 return err 363 } 364 clus.lg.Warn( 365 "consistency check SKIP FAIL", 366 zap.String("checker", chk.Type().String()), 367 zap.Strings("client-endpoints", chk.EtcdClientEndpoints()), 368 zap.Error(err), 369 ) 370 } 371 } 372 return nil 373 } 374 375 // Send_INITIAL_START_ETCD bootstraps etcd cluster the very first time. 376 // After this, just continue to call kill/restart. 377 func (clus *Cluster) Send_INITIAL_START_ETCD() error { 378 // this is the only time that creates request from scratch 379 return clus.broadcast(rpcpb.Operation_INITIAL_START_ETCD) 380 } 381 382 // send_SIGQUIT_ETCD_AND_ARCHIVE_DATA sends "send_SIGQUIT_ETCD_AND_ARCHIVE_DATA" operation. 383 func (clus *Cluster) send_SIGQUIT_ETCD_AND_ARCHIVE_DATA() error { 384 return clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA) 385 } 386 387 // send_RESTART_ETCD sends restart operation. 
388 func (clus *Cluster) send_RESTART_ETCD() error { 389 return clus.broadcast(rpcpb.Operation_RESTART_ETCD) 390 } 391 392 func (clus *Cluster) broadcast(op rpcpb.Operation) error { 393 var wg sync.WaitGroup 394 wg.Add(len(clus.agentStreams)) 395 396 errc := make(chan error, len(clus.agentStreams)) 397 for i := range clus.agentStreams { 398 go func(idx int, o rpcpb.Operation) { 399 defer wg.Done() 400 errc <- clus.sendOp(idx, o) 401 }(i, op) 402 } 403 wg.Wait() 404 close(errc) 405 406 errs := []string{} 407 for err := range errc { 408 if err == nil { 409 continue 410 } 411 412 if err != nil { 413 destroyed := false 414 if op == rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT { 415 if err == io.EOF { 416 destroyed = true 417 } 418 if strings.Contains(err.Error(), 419 "rpc error: code = Unavailable desc = transport is closing") { 420 // agent server has already closed; 421 // so this error is expected 422 destroyed = true 423 } 424 if strings.Contains(err.Error(), 425 "desc = os: process already finished") { 426 destroyed = true 427 } 428 } 429 if !destroyed { 430 errs = append(errs, err.Error()) 431 } 432 } 433 } 434 435 if len(errs) == 0 { 436 return nil 437 } 438 return errors.New(strings.Join(errs, ", ")) 439 } 440 441 func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error { 442 _, err := clus.sendOpWithResp(idx, op) 443 return err 444 } 445 446 func (clus *Cluster) sendOpWithResp(idx int, op rpcpb.Operation) (*rpcpb.Response, error) { 447 // maintain the initial member object 448 // throughout the test time 449 clus.agentRequests[idx] = &rpcpb.Request{ 450 Operation: op, 451 Member: clus.Members[idx], 452 Tester: clus.Tester, 453 } 454 455 err := clus.agentStreams[idx].Send(clus.agentRequests[idx]) 456 clus.lg.Info( 457 "sent request", 458 zap.String("operation", op.String()), 459 zap.String("to", clus.Members[idx].EtcdClientEndpoint), 460 zap.Error(err), 461 ) 462 if err != nil { 463 return nil, err 464 } 465 466 resp, err := 
clus.agentStreams[idx].Recv() 467 if resp != nil { 468 clus.lg.Info( 469 "received response", 470 zap.String("operation", op.String()), 471 zap.String("from", clus.Members[idx].EtcdClientEndpoint), 472 zap.Bool("success", resp.Success), 473 zap.String("status", resp.Status), 474 zap.Error(err), 475 ) 476 } else { 477 clus.lg.Info( 478 "received empty response", 479 zap.String("operation", op.String()), 480 zap.String("from", clus.Members[idx].EtcdClientEndpoint), 481 zap.Error(err), 482 ) 483 } 484 if err != nil { 485 return nil, err 486 } 487 488 if !resp.Success { 489 return nil, errors.New(resp.Status) 490 } 491 492 m, secure := clus.Members[idx], false 493 for _, cu := range m.Etcd.AdvertiseClientURLs { 494 u, err := url.Parse(cu) 495 if err != nil { 496 return nil, err 497 } 498 if u.Scheme == "https" { // TODO: handle unix 499 secure = true 500 } 501 } 502 503 // store TLS assets from agents/servers onto disk 504 if secure && (op == rpcpb.Operation_INITIAL_START_ETCD || op == rpcpb.Operation_RESTART_ETCD) { 505 dirClient := filepath.Join( 506 clus.Tester.DataDir, 507 clus.Members[idx].Etcd.Name, 508 "fixtures", 509 "client", 510 ) 511 if err = fileutil.TouchDirAll(dirClient); err != nil { 512 return nil, err 513 } 514 515 clientCertData := []byte(resp.Member.ClientCertData) 516 if len(clientCertData) == 0 { 517 return nil, fmt.Errorf("got empty client cert from %q", m.EtcdClientEndpoint) 518 } 519 clientCertPath := filepath.Join(dirClient, "cert.pem") 520 if err = ioutil.WriteFile(clientCertPath, clientCertData, 0644); err != nil { // overwrite if exists 521 return nil, err 522 } 523 resp.Member.ClientCertPath = clientCertPath 524 clus.lg.Info( 525 "saved client cert file", 526 zap.String("path", clientCertPath), 527 ) 528 529 clientKeyData := []byte(resp.Member.ClientKeyData) 530 if len(clientKeyData) == 0 { 531 return nil, fmt.Errorf("got empty client key from %q", m.EtcdClientEndpoint) 532 } 533 clientKeyPath := filepath.Join(dirClient, "key.pem") 534 if 
err = ioutil.WriteFile(clientKeyPath, clientKeyData, 0644); err != nil { // overwrite if exists 535 return nil, err 536 } 537 resp.Member.ClientKeyPath = clientKeyPath 538 clus.lg.Info( 539 "saved client key file", 540 zap.String("path", clientKeyPath), 541 ) 542 543 clientTrustedCAData := []byte(resp.Member.ClientTrustedCAData) 544 if len(clientTrustedCAData) != 0 { 545 // TODO: disable this when auto TLS is deprecated 546 clientTrustedCAPath := filepath.Join(dirClient, "ca.pem") 547 if err = ioutil.WriteFile(clientTrustedCAPath, clientTrustedCAData, 0644); err != nil { // overwrite if exists 548 return nil, err 549 } 550 resp.Member.ClientTrustedCAPath = clientTrustedCAPath 551 clus.lg.Info( 552 "saved client trusted CA file", 553 zap.String("path", clientTrustedCAPath), 554 ) 555 } 556 557 // no need to store peer certs for tester clients 558 559 clus.Members[idx] = resp.Member 560 } 561 562 return resp, nil 563 } 564 565 // Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT terminates all tester connections to agents and etcd servers. 566 func (clus *Cluster) Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() { 567 err := clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT) 568 if err != nil { 569 clus.lg.Warn("destroying etcd/agents FAIL", zap.Error(err)) 570 } else { 571 clus.lg.Info("destroying etcd/agents PASS") 572 } 573 574 for i, conn := range clus.agentConns { 575 err := conn.Close() 576 clus.lg.Info("closed connection to agent", zap.String("agent-address", clus.Members[i].AgentAddr), zap.Error(err)) 577 } 578 579 if clus.testerHTTPServer != nil { 580 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) 581 err := clus.testerHTTPServer.Shutdown(ctx) 582 cancel() 583 clus.lg.Info("closed tester HTTP server", zap.String("tester-address", clus.Tester.Addr), zap.Error(err)) 584 } 585 } 586 587 // WaitHealth ensures all members are healthy 588 // by writing a test key to etcd cluster. 
589 func (clus *Cluster) WaitHealth() error { 590 var err error 591 // wait 60s to check cluster health. 592 // TODO: set it to a reasonable value. It is set that high because 593 // follower may use long time to catch up the leader when reboot under 594 // reasonable workload (https://github.com/coreos/etcd/issues/2698) 595 for i := 0; i < 60; i++ { 596 for _, m := range clus.Members { 597 if err = m.WriteHealthKey(); err != nil { 598 clus.lg.Warn( 599 "health check FAIL", 600 zap.Int("retries", i), 601 zap.String("endpoint", m.EtcdClientEndpoint), 602 zap.Error(err), 603 ) 604 break 605 } 606 clus.lg.Info( 607 "health check PASS", 608 zap.Int("retries", i), 609 zap.String("endpoint", m.EtcdClientEndpoint), 610 ) 611 } 612 if err == nil { 613 clus.lg.Info("health check ALL PASS") 614 return nil 615 } 616 time.Sleep(time.Second) 617 } 618 return err 619 } 620 621 // GetLeader returns the index of leader and error if any. 622 func (clus *Cluster) GetLeader() (int, error) { 623 for i, m := range clus.Members { 624 isLeader, err := m.IsLeader() 625 if isLeader || err != nil { 626 return i, err 627 } 628 } 629 return 0, fmt.Errorf("no leader found") 630 } 631 632 // maxRev returns the maximum revision found on the cluster. 
633 func (clus *Cluster) maxRev() (rev int64, err error) { 634 ctx, cancel := context.WithTimeout(context.TODO(), time.Second) 635 defer cancel() 636 revc, errc := make(chan int64, len(clus.Members)), make(chan error, len(clus.Members)) 637 for i := range clus.Members { 638 go func(m *rpcpb.Member) { 639 mrev, merr := m.Rev(ctx) 640 revc <- mrev 641 errc <- merr 642 }(clus.Members[i]) 643 } 644 for i := 0; i < len(clus.Members); i++ { 645 if merr := <-errc; merr != nil { 646 err = merr 647 } 648 if mrev := <-revc; mrev > rev { 649 rev = mrev 650 } 651 } 652 return rev, err 653 } 654 655 func (clus *Cluster) getRevisionHash() (map[string]int64, map[string]int64, error) { 656 revs := make(map[string]int64) 657 hashes := make(map[string]int64) 658 for _, m := range clus.Members { 659 rev, hash, err := m.RevHash() 660 if err != nil { 661 return nil, nil, err 662 } 663 revs[m.EtcdClientEndpoint] = rev 664 hashes[m.EtcdClientEndpoint] = hash 665 } 666 return revs, hashes, nil 667 } 668 669 func (clus *Cluster) compactKV(rev int64, timeout time.Duration) (err error) { 670 if rev <= 0 { 671 return nil 672 } 673 674 for i, m := range clus.Members { 675 clus.lg.Info( 676 "compact START", 677 zap.String("endpoint", m.EtcdClientEndpoint), 678 zap.Int64("compact-revision", rev), 679 zap.Duration("timeout", timeout), 680 ) 681 now := time.Now() 682 cerr := m.Compact(rev, timeout) 683 succeed := true 684 if cerr != nil { 685 if strings.Contains(cerr.Error(), "required revision has been compacted") && i > 0 { 686 clus.lg.Info( 687 "compact error is ignored", 688 zap.String("endpoint", m.EtcdClientEndpoint), 689 zap.Int64("compact-revision", rev), 690 zap.Error(cerr), 691 ) 692 } else { 693 clus.lg.Warn( 694 "compact FAIL", 695 zap.String("endpoint", m.EtcdClientEndpoint), 696 zap.Int64("compact-revision", rev), 697 zap.Error(cerr), 698 ) 699 err = cerr 700 succeed = false 701 } 702 } 703 704 if succeed { 705 clus.lg.Info( 706 "compact PASS", 707 zap.String("endpoint", 
m.EtcdClientEndpoint), 708 zap.Int64("compact-revision", rev), 709 zap.Duration("timeout", timeout), 710 zap.Duration("took", time.Since(now)), 711 ) 712 } 713 } 714 return err 715 } 716 717 func (clus *Cluster) checkCompact(rev int64) error { 718 if rev == 0 { 719 return nil 720 } 721 for _, m := range clus.Members { 722 if err := m.CheckCompact(rev); err != nil { 723 return err 724 } 725 } 726 return nil 727 } 728 729 func (clus *Cluster) defrag() error { 730 for _, m := range clus.Members { 731 if err := m.Defrag(); err != nil { 732 clus.lg.Warn( 733 "defrag FAIL", 734 zap.String("endpoint", m.EtcdClientEndpoint), 735 zap.Error(err), 736 ) 737 return err 738 } 739 clus.lg.Info( 740 "defrag PASS", 741 zap.String("endpoint", m.EtcdClientEndpoint), 742 ) 743 } 744 clus.lg.Info( 745 "defrag ALL PASS", 746 zap.Int("round", clus.rd), 747 zap.Int("case", clus.cs), 748 zap.Int("case-total", len(clus.cases)), 749 ) 750 return nil 751 } 752 753 // GetCaseDelayDuration computes failure delay duration. 754 func (clus *Cluster) GetCaseDelayDuration() time.Duration { 755 return time.Duration(clus.Tester.CaseDelayMs) * time.Millisecond 756 } 757 758 // Report reports the number of modified keys. 759 func (clus *Cluster) Report() int64 { 760 return clus.stresser.ModifiedKeys() 761 }