github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/tester/cluster.go (about) 1 // Copyright 2018 The etcd Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tester 16 17 import ( 18 "context" 19 "errors" 20 "fmt" 21 "io" 22 "log" 23 "math/rand" 24 "net/http" 25 "net/url" 26 "os" 27 "path/filepath" 28 "strings" 29 "sync" 30 "time" 31 32 "github.com/lfch/etcd-io/client/pkg/v3/fileutil" 33 "github.com/lfch/etcd-io/pkg/v3/debugutil" 34 "github.com/lfch/etcd-io/tests/v3/functional/rpcpb" 35 36 "github.com/prometheus/client_golang/prometheus/promhttp" 37 "go.uber.org/zap" 38 "golang.org/x/time/rate" 39 "google.golang.org/grpc" 40 ) 41 42 // Cluster defines tester cluster. 43 type Cluster struct { 44 lg *zap.Logger 45 46 agentConns []*grpc.ClientConn 47 agentClients []rpcpb.TransportClient 48 agentStreams []rpcpb.Transport_TransportClient 49 agentRequests []*rpcpb.Request 50 51 testerHTTPServer *http.Server 52 53 Members []*rpcpb.Member `yaml:"agent-configs"` 54 Tester *rpcpb.Tester `yaml:"tester-config"` 55 56 cases []Case 57 58 rateLimiter *rate.Limiter 59 stresser Stresser 60 checkers []Checker 61 62 currentRevision int64 63 rd int 64 cs int 65 } 66 67 var dialOpts = []grpc.DialOption{ 68 grpc.WithInsecure(), 69 grpc.WithTimeout(5 * time.Second), 70 grpc.WithBlock(), 71 } 72 73 // NewCluster creates a cluster from a tester configuration. 74 func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) { 75 clus, err := read(lg, fpath) 76 if err != nil { 77 return nil, err 78 } 79 80 clus.agentConns = make([]*grpc.ClientConn, len(clus.Members)) 81 clus.agentClients = make([]rpcpb.TransportClient, len(clus.Members)) 82 clus.agentStreams = make([]rpcpb.Transport_TransportClient, len(clus.Members)) 83 clus.agentRequests = make([]*rpcpb.Request, len(clus.Members)) 84 clus.cases = make([]Case, 0) 85 86 lg.Info("creating members") 87 for i, ap := range clus.Members { 88 var err error 89 clus.agentConns[i], err = grpc.Dial(ap.AgentAddr, dialOpts...) 90 if err != nil { 91 return nil, fmt.Errorf("cannot dial agent %v: %v", ap.AgentAddr, err) 92 } 93 clus.agentClients[i] = rpcpb.NewTransportClient(clus.agentConns[i]) 94 lg.Info("connected", zap.String("agent-address", ap.AgentAddr)) 95 96 clus.agentStreams[i], err = clus.agentClients[i].Transport(context.Background()) 97 if err != nil { 98 return nil, err 99 } 100 lg.Info("created stream", zap.String("agent-address", ap.AgentAddr)) 101 } 102 103 lg.Info("agents configured.") 104 105 mux := http.NewServeMux() 106 mux.Handle("/metrics", promhttp.Handler()) 107 if clus.Tester.EnablePprof { 108 for p, h := range debugutil.PProfHandlers() { 109 mux.Handle(p, h) 110 } 111 } 112 clus.testerHTTPServer = &http.Server{ 113 Addr: clus.Tester.Addr, 114 Handler: mux, 115 ErrorLog: log.New(io.Discard, "net/http", 0), 116 } 117 go clus.serveTesterServer() 118 lg.Info("tester server started") 119 120 clus.rateLimiter = rate.NewLimiter( 121 rate.Limit(int(clus.Tester.StressQPS)), 122 int(clus.Tester.StressQPS), 123 ) 124 125 clus.setStresserChecker() 126 127 return clus, nil 128 } 129 130 // EtcdClientEndpoints returns all etcd client endpoints. 131 func (clus *Cluster) EtcdClientEndpoints() (css []string) { 132 css = make([]string, len(clus.Members)) 133 for i := range clus.Members { 134 css[i] = clus.Members[i].EtcdClientEndpoint 135 } 136 return css 137 } 138 139 func (clus *Cluster) serveTesterServer() { 140 clus.lg.Info( 141 "started tester HTTP server", 142 zap.String("tester-address", clus.Tester.Addr), 143 ) 144 err := clus.testerHTTPServer.ListenAndServe() 145 clus.lg.Info( 146 "tester HTTP server returned", 147 zap.String("tester-address", clus.Tester.Addr), 148 zap.Error(err), 149 ) 150 if err != nil && err != http.ErrServerClosed { 151 clus.lg.Fatal("tester HTTP errored", zap.Error(err)) 152 } 153 } 154 155 func (clus *Cluster) updateCases() { 156 for _, cs := range clus.Tester.Cases { 157 switch cs { 158 case "SIGTERM_ONE_FOLLOWER": 159 clus.cases = append(clus.cases, 160 new_Case_SIGTERM_ONE_FOLLOWER(clus)) 161 case "SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": 162 clus.cases = append(clus.cases, 163 new_Case_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus)) 164 case "SIGTERM_LEADER": 165 clus.cases = append(clus.cases, 166 new_Case_SIGTERM_LEADER(clus)) 167 case "SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT": 168 clus.cases = append(clus.cases, 169 new_Case_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus)) 170 case "SIGTERM_QUORUM": 171 clus.cases = append(clus.cases, 172 new_Case_SIGTERM_QUORUM(clus)) 173 case "SIGTERM_ALL": 174 clus.cases = append(clus.cases, 175 new_Case_SIGTERM_ALL(clus)) 176 177 case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER": 178 clus.cases = append(clus.cases, 179 new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus)) 180 case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": 181 clus.cases = append(clus.cases, 182 new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus)) 183 case "SIGQUIT_AND_REMOVE_LEADER": 184 clus.cases = append(clus.cases, 185 new_Case_SIGQUIT_AND_REMOVE_LEADER(clus)) 186 case "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT": 187 clus.cases = append(clus.cases, 188 new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus)) 189 case "SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH": 190 clus.cases = append(clus.cases, 191 new_Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH(clus)) 192 193 case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER": 194 clus.cases = append(clus.cases, 195 new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus)) 196 case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": 197 clus.cases = append(clus.cases, 198 new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT()) 199 case "BLACKHOLE_PEER_PORT_TX_RX_LEADER": 200 clus.cases = append(clus.cases, 201 new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER(clus)) 202 case "BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT": 203 clus.cases = append(clus.cases, 204 new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT()) 205 case "BLACKHOLE_PEER_PORT_TX_RX_QUORUM": 206 clus.cases = append(clus.cases, 207 new_Case_BLACKHOLE_PEER_PORT_TX_RX_QUORUM(clus)) 208 case "BLACKHOLE_PEER_PORT_TX_RX_ALL": 209 clus.cases = append(clus.cases, 210 new_Case_BLACKHOLE_PEER_PORT_TX_RX_ALL(clus)) 211 212 case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER": 213 clus.cases = append(clus.cases, 214 new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, false)) 215 case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER": 216 clus.cases = append(clus.cases, 217 new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, true)) 218 case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": 219 clus.cases = append(clus.cases, 220 new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, false)) 221 case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": 222 clus.cases = append(clus.cases, 223 new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, true)) 224 case "DELAY_PEER_PORT_TX_RX_LEADER": 225 clus.cases = append(clus.cases, 226 new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus, false)) 227 case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER": 228 clus.cases = append(clus.cases, 229 new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus, true)) 230 case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT": 231 clus.cases = append(clus.cases, 232 new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, false)) 233 case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT": 234 clus.cases = append(clus.cases, 235 new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, true)) 236 case "DELAY_PEER_PORT_TX_RX_QUORUM": 237 clus.cases = append(clus.cases, 238 new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus, false)) 239 case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM": 240 clus.cases = append(clus.cases, 241 new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus, true)) 242 case "DELAY_PEER_PORT_TX_RX_ALL": 243 clus.cases = append(clus.cases, 244 new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus, false)) 245 case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL": 246 clus.cases = append(clus.cases, 247 new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus, true)) 248 249 case "NO_FAIL_WITH_STRESS": 250 clus.cases = append(clus.cases, 251 new_Case_NO_FAIL_WITH_STRESS(clus)) 252 case "NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS": 253 clus.cases = append(clus.cases, 254 new_Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS(clus)) 255 256 case "EXTERNAL": 257 clus.cases = append(clus.cases, 258 new_Case_EXTERNAL(clus.Tester.ExternalExecPath)) 259 case "FAILPOINTS": 260 fpFailures, fperr := failpointFailures(clus) 261 if len(fpFailures) == 0 { 262 clus.lg.Info("no failpoints found!", zap.Error(fperr)) 263 } 264 clus.cases = append(clus.cases, 265 fpFailures...) 266 case "FAILPOINTS_WITH_DISK_IO_LATENCY": 267 fpFailures, fperr := failpointDiskIOFailures(clus) 268 if len(fpFailures) == 0 { 269 clus.lg.Info("no failpoints found!", zap.Error(fperr)) 270 } 271 clus.cases = append(clus.cases, 272 fpFailures...) 273 } 274 } 275 } 276 277 func (clus *Cluster) listCases() (css []string) { 278 css = make([]string, len(clus.cases)) 279 for i := range clus.cases { 280 css[i] = clus.cases[i].Desc() 281 } 282 return css 283 } 284 285 // UpdateDelayLatencyMs updates delay latency with random value 286 // within election timeout. 287 func (clus *Cluster) UpdateDelayLatencyMs() { 288 rand.Seed(time.Now().UnixNano()) 289 clus.Tester.UpdatedDelayLatencyMs = uint32(rand.Int63n(clus.Members[0].Etcd.ElectionTimeoutMs)) 290 291 minLatRv := clus.Tester.DelayLatencyMsRv + clus.Tester.DelayLatencyMsRv/5 292 if clus.Tester.UpdatedDelayLatencyMs <= minLatRv { 293 clus.Tester.UpdatedDelayLatencyMs += minLatRv 294 } 295 } 296 297 func (clus *Cluster) setStresserChecker() { 298 css := &compositeStresser{} 299 var lss []*leaseStresser 300 var rss []*runnerStresser 301 for _, m := range clus.Members { 302 sss := newStresser(clus, m) 303 css.stressers = append(css.stressers, &compositeStresser{sss}) 304 for _, s := range sss { 305 if v, ok := s.(*leaseStresser); ok { 306 lss = append(lss, v) 307 clus.lg.Info("added lease stresser", zap.String("endpoint", m.EtcdClientEndpoint)) 308 } 309 if v, ok := s.(*runnerStresser); ok { 310 rss = append(rss, v) 311 clus.lg.Info("added lease stresser", zap.String("endpoint", m.EtcdClientEndpoint)) 312 } 313 } 314 } 315 clus.stresser = css 316 317 for _, cs := range clus.Tester.Checkers { 318 switch cs { 319 case "KV_HASH": 320 clus.checkers = append(clus.checkers, newKVHashChecker(clus)) 321 322 case "LEASE_EXPIRE": 323 for _, ls := range lss { 324 clus.checkers = append(clus.checkers, newLeaseExpireChecker(ls)) 325 } 326 327 case "RUNNER": 328 for _, rs := range rss { 329 clus.checkers = append(clus.checkers, newRunnerChecker(rs.etcdClientEndpoint, rs.errc)) 330 } 331 332 case "NO_CHECK": 333 clus.checkers = append(clus.checkers, newNoChecker()) 334 335 case "SHORT_TTL_LEASE_EXPIRE": 336 for _, ls := range lss { 337 clus.checkers = append(clus.checkers, newShortTTLLeaseExpireChecker(ls)) 338 } 339 } 340 } 341 clus.lg.Info("updated stressers") 342 } 343 344 func (clus *Cluster) runCheckers(exceptions ...rpcpb.Checker) (err error) { 345 defer func() { 346 if err != nil { 347 return 348 } 349 if err = clus.updateRevision(); err != nil { 350 clus.lg.Warn( 351 "updateRevision failed", 352 zap.Error(err), 353 ) 354 return 355 } 356 }() 357 358 exs := make(map[rpcpb.Checker]struct{}) 359 for _, e := range exceptions { 360 exs[e] = struct{}{} 361 } 362 for _, chk := range clus.checkers { 363 clus.lg.Warn( 364 "consistency check START", 365 zap.String("checker", chk.Type().String()), 366 zap.Strings("client-endpoints", chk.EtcdClientEndpoints()), 367 ) 368 err = chk.Check() 369 clus.lg.Warn( 370 "consistency check END", 371 zap.String("checker", chk.Type().String()), 372 zap.Strings("client-endpoints", chk.EtcdClientEndpoints()), 373 zap.Error(err), 374 ) 375 if err != nil { 376 _, ok := exs[chk.Type()] 377 if !ok { 378 return err 379 } 380 clus.lg.Warn( 381 "consistency check SKIP FAIL", 382 zap.String("checker", chk.Type().String()), 383 zap.Strings("client-endpoints", chk.EtcdClientEndpoints()), 384 zap.Error(err), 385 ) 386 } 387 } 388 return nil 389 } 390 391 // Send_INITIAL_START_ETCD bootstraps etcd cluster the very first time. 392 // After this, just continue to call kill/restart. 393 func (clus *Cluster) Send_INITIAL_START_ETCD() error { 394 // this is the only time that creates request from scratch 395 return clus.broadcast(rpcpb.Operation_INITIAL_START_ETCD) 396 } 397 398 // send_SIGQUIT_ETCD_AND_ARCHIVE_DATA sends "send_SIGQUIT_ETCD_AND_ARCHIVE_DATA" operation. 399 func (clus *Cluster) send_SIGQUIT_ETCD_AND_ARCHIVE_DATA() error { 400 return clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA) 401 } 402 403 // send_RESTART_ETCD sends restart operation. 404 func (clus *Cluster) send_RESTART_ETCD() error { 405 return clus.broadcast(rpcpb.Operation_RESTART_ETCD) 406 } 407 408 func (clus *Cluster) broadcast(op rpcpb.Operation) error { 409 var wg sync.WaitGroup 410 wg.Add(len(clus.agentStreams)) 411 412 errc := make(chan error, len(clus.agentStreams)) 413 for i := range clus.agentStreams { 414 go func(idx int, o rpcpb.Operation) { 415 defer wg.Done() 416 errc <- clus.sendOp(idx, o) 417 }(i, op) 418 } 419 wg.Wait() 420 close(errc) 421 422 var errs []string 423 for err := range errc { 424 if err == nil { 425 continue 426 } 427 428 if err != nil { 429 destroyed := false 430 if op == rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT { 431 if err == io.EOF { 432 destroyed = true 433 } 434 if strings.Contains(err.Error(), 435 "rpc error: code = Unavailable desc = transport is closing") { 436 // agent server has already closed; 437 // so this error is expected 438 destroyed = true 439 } 440 if strings.Contains(err.Error(), 441 "desc = os: process already finished") { 442 destroyed = true 443 } 444 } 445 if !destroyed { 446 errs = append(errs, err.Error()) 447 } 448 } 449 } 450 451 if len(errs) == 0 { 452 return nil 453 } 454 return errors.New(strings.Join(errs, ", ")) 455 } 456 457 func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error { 458 _, err := clus.sendOpWithResp(idx, op) 459 return err 460 } 461 462 func (clus *Cluster) sendOpWithResp(idx int, op rpcpb.Operation) (*rpcpb.Response, error) { 463 // maintain the initial member object 464 // throughout the test time 465 clus.agentRequests[idx] = &rpcpb.Request{ 466 Operation: op, 467 Member: clus.Members[idx], 468 Tester: clus.Tester, 469 } 470 471 err := clus.agentStreams[idx].Send(clus.agentRequests[idx]) 472 clus.lg.Info( 473 "sent request", 474 zap.String("operation", op.String()), 475 zap.String("to", clus.Members[idx].EtcdClientEndpoint), 476 zap.Error(err), 477 ) 478 if err != nil { 479 return nil, err 480 } 481 482 resp, err := clus.agentStreams[idx].Recv() 483 if resp != nil { 484 clus.lg.Info( 485 "received response", 486 zap.String("operation", op.String()), 487 zap.String("from", clus.Members[idx].EtcdClientEndpoint), 488 zap.Bool("success", resp.Success), 489 zap.String("status", resp.Status), 490 zap.Error(err), 491 ) 492 } else { 493 clus.lg.Info( 494 "received empty response", 495 zap.String("operation", op.String()), 496 zap.String("from", clus.Members[idx].EtcdClientEndpoint), 497 zap.Error(err), 498 ) 499 } 500 if err != nil { 501 return nil, err 502 } 503 504 if !resp.Success { 505 return nil, errors.New(resp.Status) 506 } 507 508 m, secure := clus.Members[idx], false 509 for _, cu := range m.Etcd.AdvertiseClientURLs { 510 u, perr := url.Parse(cu) 511 if perr != nil { 512 return nil, perr 513 } 514 if u.Scheme == "https" { // TODO: handle unix 515 secure = true 516 } 517 } 518 519 // store TLS assets from agents/servers onto disk 520 if secure && (op == rpcpb.Operation_INITIAL_START_ETCD || op == rpcpb.Operation_RESTART_ETCD) { 521 dirClient := filepath.Join( 522 clus.Tester.DataDir, 523 clus.Members[idx].Etcd.Name, 524 "fixtures", 525 "client", 526 ) 527 if err = fileutil.TouchDirAll(clus.lg, dirClient); err != nil { 528 return nil, err 529 } 530 531 clientCertData := []byte(resp.Member.ClientCertData) 532 if len(clientCertData) == 0 { 533 return nil, fmt.Errorf("got empty client cert from %q", m.EtcdClientEndpoint) 534 } 535 clientCertPath := filepath.Join(dirClient, "cert.pem") 536 if err = os.WriteFile(clientCertPath, clientCertData, 0644); err != nil { // overwrite if exists 537 return nil, err 538 } 539 resp.Member.ClientCertPath = clientCertPath 540 clus.lg.Info( 541 "saved client cert file", 542 zap.String("path", clientCertPath), 543 ) 544 545 clientKeyData := []byte(resp.Member.ClientKeyData) 546 if len(clientKeyData) == 0 { 547 return nil, fmt.Errorf("got empty client key from %q", m.EtcdClientEndpoint) 548 } 549 clientKeyPath := filepath.Join(dirClient, "key.pem") 550 if err = os.WriteFile(clientKeyPath, clientKeyData, 0644); err != nil { // overwrite if exists 551 return nil, err 552 } 553 resp.Member.ClientKeyPath = clientKeyPath 554 clus.lg.Info( 555 "saved client key file", 556 zap.String("path", clientKeyPath), 557 ) 558 559 clientTrustedCAData := []byte(resp.Member.ClientTrustedCAData) 560 if len(clientTrustedCAData) != 0 { 561 // TODO: disable this when auto TLS is deprecated 562 clientTrustedCAPath := filepath.Join(dirClient, "ca.pem") 563 if err = os.WriteFile(clientTrustedCAPath, clientTrustedCAData, 0644); err != nil { // overwrite if exists 564 return nil, err 565 } 566 resp.Member.ClientTrustedCAPath = clientTrustedCAPath 567 clus.lg.Info( 568 "saved client trusted CA file", 569 zap.String("path", clientTrustedCAPath), 570 ) 571 } 572 573 // no need to store peer certs for tester clients 574 575 clus.Members[idx] = resp.Member 576 } 577 578 return resp, nil 579 } 580 581 // Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT terminates all tester connections to agents and etcd servers. 582 func (clus *Cluster) Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() { 583 err := clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT) 584 if err != nil { 585 clus.lg.Warn("destroying etcd/agents FAIL", zap.Error(err)) 586 } else { 587 clus.lg.Info("destroying etcd/agents PASS") 588 } 589 590 for i, conn := range clus.agentConns { 591 err := conn.Close() 592 clus.lg.Info("closed connection to agent", zap.String("agent-address", clus.Members[i].AgentAddr), zap.Error(err)) 593 } 594 595 if clus.testerHTTPServer != nil { 596 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) 597 err := clus.testerHTTPServer.Shutdown(ctx) 598 cancel() 599 clus.lg.Info("closed tester HTTP server", zap.String("tester-address", clus.Tester.Addr), zap.Error(err)) 600 } 601 } 602 603 // WaitHealth ensures all members are healthy 604 // by writing a test key to etcd cluster. 605 func (clus *Cluster) WaitHealth() error { 606 var err error 607 // wait 60s to check cluster health. 608 // TODO: set it to a reasonable value. It is set that high because 609 // follower may use long time to catch up the leader when reboot under 610 // reasonable workload (https://github.com/etcd-io/etcd/issues/2698) 611 for i := 0; i < 60; i++ { 612 for _, m := range clus.Members { 613 if err = m.WriteHealthKey(); err != nil { 614 clus.lg.Warn( 615 "health check FAIL", 616 zap.Int("retries", i), 617 zap.String("endpoint", m.EtcdClientEndpoint), 618 zap.Error(err), 619 ) 620 break 621 } 622 clus.lg.Info( 623 "health check PASS", 624 zap.Int("retries", i), 625 zap.String("endpoint", m.EtcdClientEndpoint), 626 ) 627 } 628 if err == nil { 629 clus.lg.Info("health check ALL PASS") 630 return nil 631 } 632 time.Sleep(time.Second) 633 } 634 return err 635 } 636 637 // GetLeader returns the index of leader and error if any. 638 func (clus *Cluster) GetLeader() (int, error) { 639 for i, m := range clus.Members { 640 isLeader, err := m.IsLeader() 641 if isLeader || err != nil { 642 return i, err 643 } 644 } 645 return 0, fmt.Errorf("no leader found") 646 } 647 648 // maxRev returns the maximum revision found on the cluster. 649 func (clus *Cluster) maxRev() (rev int64, err error) { 650 ctx, cancel := context.WithTimeout(context.TODO(), time.Second) 651 defer cancel() 652 revc, errc := make(chan int64, len(clus.Members)), make(chan error, len(clus.Members)) 653 for i := range clus.Members { 654 go func(m *rpcpb.Member) { 655 mrev, merr := m.Rev(ctx) 656 revc <- mrev 657 errc <- merr 658 }(clus.Members[i]) 659 } 660 for i := 0; i < len(clus.Members); i++ { 661 if merr := <-errc; merr != nil { 662 err = merr 663 } 664 if mrev := <-revc; mrev > rev { 665 rev = mrev 666 } 667 } 668 return rev, err 669 } 670 671 func (clus *Cluster) getRevisionHash() (map[string]int64, map[string]int64, error) { 672 revs := make(map[string]int64) 673 hashes := make(map[string]int64) 674 for _, m := range clus.Members { 675 rev, hash, err := m.RevHash() 676 if err != nil { 677 return nil, nil, err 678 } 679 revs[m.EtcdClientEndpoint] = rev 680 hashes[m.EtcdClientEndpoint] = hash 681 } 682 return revs, hashes, nil 683 } 684 685 func (clus *Cluster) compactKV(rev int64, timeout time.Duration) (err error) { 686 if rev <= 0 { 687 return nil 688 } 689 690 for i, m := range clus.Members { 691 clus.lg.Info( 692 "compact START", 693 zap.String("endpoint", m.EtcdClientEndpoint), 694 zap.Int64("compact-revision", rev), 695 zap.Duration("timeout", timeout), 696 ) 697 now := time.Now() 698 cerr := m.Compact(rev, timeout) 699 succeed := true 700 if cerr != nil { 701 if strings.Contains(cerr.Error(), "required revision has been compacted") && i > 0 { 702 clus.lg.Info( 703 "compact error is ignored", 704 zap.String("endpoint", m.EtcdClientEndpoint), 705 zap.Int64("compact-revision", rev), 706 zap.String("expected-error-msg", cerr.Error()), 707 ) 708 } else { 709 clus.lg.Warn( 710 "compact FAIL", 711 zap.String("endpoint", m.EtcdClientEndpoint), 712 zap.Int64("compact-revision", rev), 713 zap.Error(cerr), 714 ) 715 err = cerr 716 succeed = false 717 } 718 } 719 720 if succeed { 721 clus.lg.Info( 722 "compact PASS", 723 zap.String("endpoint", m.EtcdClientEndpoint), 724 zap.Int64("compact-revision", rev), 725 zap.Duration("timeout", timeout), 726 zap.Duration("took", time.Since(now)), 727 ) 728 } 729 } 730 return err 731 } 732 733 func (clus *Cluster) checkCompact(rev int64) error { 734 if rev == 0 { 735 return nil 736 } 737 for _, m := range clus.Members { 738 if err := m.CheckCompact(rev); err != nil { 739 return err 740 } 741 } 742 return nil 743 } 744 745 func (clus *Cluster) defrag() error { 746 for _, m := range clus.Members { 747 if err := m.Defrag(); err != nil { 748 clus.lg.Warn( 749 "defrag FAIL", 750 zap.String("endpoint", m.EtcdClientEndpoint), 751 zap.Error(err), 752 ) 753 return err 754 } 755 clus.lg.Info( 756 "defrag PASS", 757 zap.String("endpoint", m.EtcdClientEndpoint), 758 ) 759 } 760 clus.lg.Info( 761 "defrag ALL PASS", 762 zap.Int("round", clus.rd), 763 zap.Int("case", clus.cs), 764 zap.Int("case-total", len(clus.cases)), 765 ) 766 return nil 767 } 768 769 // GetCaseDelayDuration computes failure delay duration. 770 func (clus *Cluster) GetCaseDelayDuration() time.Duration { 771 return time.Duration(clus.Tester.CaseDelayMs) * time.Millisecond 772 } 773 774 // Report reports the number of modified keys. 775 func (clus *Cluster) Report() int64 { 776 return clus.stresser.ModifiedKeys() 777 }