github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/tester/cluster.go

     1  // Copyright 2018 The etcd Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tester
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  	"log"
    23  	"math/rand"
    24  	"net/http"
    25  	"net/url"
    26  	"os"
    27  	"path/filepath"
    28  	"strings"
    29  	"sync"
    30  	"time"
    31  
    32  	"github.com/lfch/etcd-io/client/pkg/v3/fileutil"
    33  	"github.com/lfch/etcd-io/pkg/v3/debugutil"
    34  	"github.com/lfch/etcd-io/tests/v3/functional/rpcpb"
    35  
    36  	"github.com/prometheus/client_golang/prometheus/promhttp"
    37  	"go.uber.org/zap"
    38  	"golang.org/x/time/rate"
    39  	"google.golang.org/grpc"
    40  )
    41  
    42  // Cluster defines a tester cluster.
    43  type Cluster struct {
    44  	lg *zap.Logger
    45  
    46  	agentConns    []*grpc.ClientConn
    47  	agentClients  []rpcpb.TransportClient
    48  	agentStreams  []rpcpb.Transport_TransportClient
    49  	agentRequests []*rpcpb.Request
    50  
    51  	testerHTTPServer *http.Server
    52  
    53  	Members []*rpcpb.Member `yaml:"agent-configs"`
    54  	Tester  *rpcpb.Tester   `yaml:"tester-config"`
    55  
    56  	cases []Case
    57  
    58  	rateLimiter *rate.Limiter
    59  	stresser    Stresser
    60  	checkers    []Checker
    61  
    62  	currentRevision int64
    63  	rd              int
    64  	cs              int
    65  }
    66  
    67  var dialOpts = []grpc.DialOption{
    68  	grpc.WithInsecure(),
    69  	grpc.WithTimeout(5 * time.Second),
    70  	grpc.WithBlock(),
    71  }
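
        // NOTE: grpc.WithInsecure and grpc.WithTimeout are deprecated in newer
        // grpc-go releases. A minimal sketch of an equivalent dial, assuming the
        // grpc-go version in use exposes google.golang.org/grpc/credentials/insecure
        // and grpc.DialContext:
        //
        //	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        //	defer cancel()
        //	conn, err := grpc.DialContext(ctx, ap.AgentAddr,
        //		grpc.WithTransportCredentials(insecure.NewCredentials()),
        //		grpc.WithBlock(),
        //	)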
    72  
    73  // NewCluster creates a cluster from a tester configuration.
    74  func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
    75  	clus, err := read(lg, fpath)
    76  	if err != nil {
    77  		return nil, err
    78  	}
    79  
    80  	clus.agentConns = make([]*grpc.ClientConn, len(clus.Members))
    81  	clus.agentClients = make([]rpcpb.TransportClient, len(clus.Members))
    82  	clus.agentStreams = make([]rpcpb.Transport_TransportClient, len(clus.Members))
    83  	clus.agentRequests = make([]*rpcpb.Request, len(clus.Members))
    84  	clus.cases = make([]Case, 0)
    85  
    86  	lg.Info("creating members")
    87  	for i, ap := range clus.Members {
    88  		var err error
    89  		clus.agentConns[i], err = grpc.Dial(ap.AgentAddr, dialOpts...)
    90  		if err != nil {
    91  			return nil, fmt.Errorf("cannot dial agent %v: %v", ap.AgentAddr, err)
    92  		}
    93  		clus.agentClients[i] = rpcpb.NewTransportClient(clus.agentConns[i])
    94  		lg.Info("connected", zap.String("agent-address", ap.AgentAddr))
    95  
    96  		clus.agentStreams[i], err = clus.agentClients[i].Transport(context.Background())
    97  		if err != nil {
    98  			return nil, err
    99  		}
   100  		lg.Info("created stream", zap.String("agent-address", ap.AgentAddr))
   101  	}
   102  
   103  	lg.Info("agents configured.")
   104  
   105  	mux := http.NewServeMux()
   106  	mux.Handle("/metrics", promhttp.Handler())
   107  	if clus.Tester.EnablePprof {
   108  		for p, h := range debugutil.PProfHandlers() {
   109  			mux.Handle(p, h)
   110  		}
   111  	}
   112  	clus.testerHTTPServer = &http.Server{
   113  		Addr:     clus.Tester.Addr,
   114  		Handler:  mux,
   115  		ErrorLog: log.New(io.Discard, "net/http", 0),
   116  	}
   117  	go clus.serveTesterServer()
   118  	lg.Info("tester server started")
   119  
   120  	clus.rateLimiter = rate.NewLimiter(
   121  		rate.Limit(int(clus.Tester.StressQPS)),
   122  		int(clus.Tester.StressQPS),
   123  	)
   124  
   125  	clus.setStresserChecker()
   126  
   127  	return clus, nil
   128  }
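
        // A minimal sketch of the tester cluster lifecycle, assuming a valid tester
        // configuration at cfgPath (hypothetical path); every call below is defined
        // in this package:
        //
        //	clus, err := NewCluster(lg, cfgPath)
        //	if err != nil {
        //		lg.Fatal("failed to create tester cluster", zap.Error(err))
        //	}
        //	defer clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()
        //	if err = clus.Send_INITIAL_START_ETCD(); err != nil {
        //		lg.Fatal("failed to start etcd", zap.Error(err))
        //	}
        //	if err = clus.WaitHealth(); err != nil {
        //		lg.Fatal("cluster never became healthy", zap.Error(err))
        //	}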
   129  
   130  // EtcdClientEndpoints returns all etcd client endpoints.
   131  func (clus *Cluster) EtcdClientEndpoints() (css []string) {
   132  	css = make([]string, len(clus.Members))
   133  	for i := range clus.Members {
   134  		css[i] = clus.Members[i].EtcdClientEndpoint
   135  	}
   136  	return css
   137  }
   138  
   139  func (clus *Cluster) serveTesterServer() {
   140  	clus.lg.Info(
   141  		"started tester HTTP server",
   142  		zap.String("tester-address", clus.Tester.Addr),
   143  	)
   144  	err := clus.testerHTTPServer.ListenAndServe()
   145  	clus.lg.Info(
   146  		"tester HTTP server returned",
   147  		zap.String("tester-address", clus.Tester.Addr),
   148  		zap.Error(err),
   149  	)
   150  	if err != nil && err != http.ErrServerClosed {
   151  		clus.lg.Fatal("tester HTTP errored", zap.Error(err))
   152  	}
   153  }
   154  
   155  func (clus *Cluster) updateCases() {
   156  	for _, cs := range clus.Tester.Cases {
   157  		switch cs {
   158  		case "SIGTERM_ONE_FOLLOWER":
   159  			clus.cases = append(clus.cases,
   160  				new_Case_SIGTERM_ONE_FOLLOWER(clus))
   161  		case "SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
   162  			clus.cases = append(clus.cases,
   163  				new_Case_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus))
   164  		case "SIGTERM_LEADER":
   165  			clus.cases = append(clus.cases,
   166  				new_Case_SIGTERM_LEADER(clus))
   167  		case "SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT":
   168  			clus.cases = append(clus.cases,
   169  				new_Case_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus))
   170  		case "SIGTERM_QUORUM":
   171  			clus.cases = append(clus.cases,
   172  				new_Case_SIGTERM_QUORUM(clus))
   173  		case "SIGTERM_ALL":
   174  			clus.cases = append(clus.cases,
   175  				new_Case_SIGTERM_ALL(clus))
   176  
   177  		case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER":
   178  			clus.cases = append(clus.cases,
   179  				new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus))
   180  		case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
   181  			clus.cases = append(clus.cases,
   182  				new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus))
   183  		case "SIGQUIT_AND_REMOVE_LEADER":
   184  			clus.cases = append(clus.cases,
   185  				new_Case_SIGQUIT_AND_REMOVE_LEADER(clus))
   186  		case "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT":
   187  			clus.cases = append(clus.cases,
   188  				new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus))
   189  		case "SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH":
   190  			clus.cases = append(clus.cases,
   191  				new_Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH(clus))
   192  
   193  		case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER":
   194  			clus.cases = append(clus.cases,
   195  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus))
   196  		case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
   197  			clus.cases = append(clus.cases,
   198  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT())
   199  		case "BLACKHOLE_PEER_PORT_TX_RX_LEADER":
   200  			clus.cases = append(clus.cases,
   201  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER(clus))
   202  		case "BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
   203  			clus.cases = append(clus.cases,
   204  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT())
   205  		case "BLACKHOLE_PEER_PORT_TX_RX_QUORUM":
   206  			clus.cases = append(clus.cases,
   207  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_QUORUM(clus))
   208  		case "BLACKHOLE_PEER_PORT_TX_RX_ALL":
   209  			clus.cases = append(clus.cases,
   210  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_ALL(clus))
   211  
   212  		case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
   213  			clus.cases = append(clus.cases,
   214  				new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, false))
   215  		case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
   216  			clus.cases = append(clus.cases,
   217  				new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, true))
   218  		case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
   219  			clus.cases = append(clus.cases,
   220  				new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, false))
   221  		case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
   222  			clus.cases = append(clus.cases,
   223  				new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, true))
   224  		case "DELAY_PEER_PORT_TX_RX_LEADER":
   225  			clus.cases = append(clus.cases,
   226  				new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus, false))
   227  		case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER":
   228  			clus.cases = append(clus.cases,
   229  				new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus, true))
   230  		case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
   231  			clus.cases = append(clus.cases,
   232  				new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, false))
   233  		case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
   234  			clus.cases = append(clus.cases,
   235  				new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, true))
   236  		case "DELAY_PEER_PORT_TX_RX_QUORUM":
   237  			clus.cases = append(clus.cases,
   238  				new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus, false))
   239  		case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM":
   240  			clus.cases = append(clus.cases,
   241  				new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus, true))
   242  		case "DELAY_PEER_PORT_TX_RX_ALL":
   243  			clus.cases = append(clus.cases,
   244  				new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus, false))
   245  		case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL":
   246  			clus.cases = append(clus.cases,
   247  				new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus, true))
   248  
   249  		case "NO_FAIL_WITH_STRESS":
   250  			clus.cases = append(clus.cases,
   251  				new_Case_NO_FAIL_WITH_STRESS(clus))
   252  		case "NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS":
   253  			clus.cases = append(clus.cases,
   254  				new_Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS(clus))
   255  
   256  		case "EXTERNAL":
   257  			clus.cases = append(clus.cases,
   258  				new_Case_EXTERNAL(clus.Tester.ExternalExecPath))
   259  		case "FAILPOINTS":
   260  			fpFailures, fperr := failpointFailures(clus)
   261  			if len(fpFailures) == 0 {
   262  				clus.lg.Info("no failpoints found!", zap.Error(fperr))
   263  			}
   264  			clus.cases = append(clus.cases,
   265  				fpFailures...)
   266  		case "FAILPOINTS_WITH_DISK_IO_LATENCY":
   267  			fpFailures, fperr := failpointDiskIOFailures(clus)
   268  			if len(fpFailures) == 0 {
   269  				clus.lg.Info("no failpoints found!", zap.Error(fperr))
   270  			}
   271  			clus.cases = append(clus.cases,
   272  				fpFailures...)
   273  		}
   274  	}
   275  }
   276  
   277  func (clus *Cluster) listCases() (css []string) {
   278  	css = make([]string, len(clus.cases))
   279  	for i := range clus.cases {
   280  		css[i] = clus.cases[i].Desc()
   281  	}
   282  	return css
   283  }
   284  
   285  // UpdateDelayLatencyMs updates the delay latency to a random value
   286  // within the election timeout.
   287  func (clus *Cluster) UpdateDelayLatencyMs() {
   288  	rand.Seed(time.Now().UnixNano())
   289  	clus.Tester.UpdatedDelayLatencyMs = uint32(rand.Int63n(clus.Members[0].Etcd.ElectionTimeoutMs))
   290  
   291  	minLatRv := clus.Tester.DelayLatencyMsRv + clus.Tester.DelayLatencyMsRv/5
   292  	if clus.Tester.UpdatedDelayLatencyMs <= minLatRv {
   293  		clus.Tester.UpdatedDelayLatencyMs += minLatRv
   294  	}
   295  }
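
        // Illustrative arithmetic for the update above, using hypothetical values:
        // with ElectionTimeoutMs=1000 and DelayLatencyMsRv=500, minLatRv is
        // 500+500/5 = 600 ms, so a random draw of 250 ms is bumped to 850 ms,
        // while a draw of 700 ms is kept as is.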
   296  
   297  func (clus *Cluster) setStresserChecker() {
   298  	css := &compositeStresser{}
   299  	var lss []*leaseStresser
   300  	var rss []*runnerStresser
   301  	for _, m := range clus.Members {
   302  		sss := newStresser(clus, m)
   303  		css.stressers = append(css.stressers, &compositeStresser{sss})
   304  		for _, s := range sss {
   305  			if v, ok := s.(*leaseStresser); ok {
   306  				lss = append(lss, v)
   307  				clus.lg.Info("added lease stresser", zap.String("endpoint", m.EtcdClientEndpoint))
   308  			}
   309  			if v, ok := s.(*runnerStresser); ok {
   310  				rss = append(rss, v)
   311  				clus.lg.Info("added runner stresser", zap.String("endpoint", m.EtcdClientEndpoint))
   312  			}
   313  		}
   314  	}
   315  	clus.stresser = css
   316  
   317  	for _, cs := range clus.Tester.Checkers {
   318  		switch cs {
   319  		case "KV_HASH":
   320  			clus.checkers = append(clus.checkers, newKVHashChecker(clus))
   321  
   322  		case "LEASE_EXPIRE":
   323  			for _, ls := range lss {
   324  				clus.checkers = append(clus.checkers, newLeaseExpireChecker(ls))
   325  			}
   326  
   327  		case "RUNNER":
   328  			for _, rs := range rss {
   329  				clus.checkers = append(clus.checkers, newRunnerChecker(rs.etcdClientEndpoint, rs.errc))
   330  			}
   331  
   332  		case "NO_CHECK":
   333  			clus.checkers = append(clus.checkers, newNoChecker())
   334  
   335  		case "SHORT_TTL_LEASE_EXPIRE":
   336  			for _, ls := range lss {
   337  				clus.checkers = append(clus.checkers, newShortTTLLeaseExpireChecker(ls))
   338  			}
   339  		}
   340  	}
   341  	clus.lg.Info("updated stressers and checkers")
   342  }
   343  
   344  func (clus *Cluster) runCheckers(exceptions ...rpcpb.Checker) (err error) {
   345  	defer func() {
   346  		if err != nil {
   347  			return
   348  		}
   349  		if err = clus.updateRevision(); err != nil {
   350  			clus.lg.Warn(
   351  				"updateRevision failed",
   352  				zap.Error(err),
   353  			)
   354  			return
   355  		}
   356  	}()
   357  
   358  	exs := make(map[rpcpb.Checker]struct{})
   359  	for _, e := range exceptions {
   360  		exs[e] = struct{}{}
   361  	}
   362  	for _, chk := range clus.checkers {
   363  		clus.lg.Warn(
   364  			"consistency check START",
   365  			zap.String("checker", chk.Type().String()),
   366  			zap.Strings("client-endpoints", chk.EtcdClientEndpoints()),
   367  		)
   368  		err = chk.Check()
   369  		clus.lg.Warn(
   370  			"consistency check END",
   371  			zap.String("checker", chk.Type().String()),
   372  			zap.Strings("client-endpoints", chk.EtcdClientEndpoints()),
   373  			zap.Error(err),
   374  		)
   375  		if err != nil {
   376  			_, ok := exs[chk.Type()]
   377  			if !ok {
   378  				return err
   379  			}
   380  			clus.lg.Warn(
   381  				"consistency check SKIP FAIL",
   382  				zap.String("checker", chk.Type().String()),
   383  				zap.Strings("client-endpoints", chk.EtcdClientEndpoints()),
   384  				zap.Error(err),
   385  			)
   386  		}
   387  	}
   388  	return nil
   389  }
   390  
   391  // Send_INITIAL_START_ETCD bootstraps the etcd cluster for the very first time.
   392  // After this, just continue to call kill/restart operations.
   393  func (clus *Cluster) Send_INITIAL_START_ETCD() error {
   394  	// this is the only time the request is created from scratch
   395  	return clus.broadcast(rpcpb.Operation_INITIAL_START_ETCD)
   396  }
   397  
   398  // send_SIGQUIT_ETCD_AND_ARCHIVE_DATA sends the "SIGQUIT_ETCD_AND_ARCHIVE_DATA" operation.
   399  func (clus *Cluster) send_SIGQUIT_ETCD_AND_ARCHIVE_DATA() error {
   400  	return clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA)
   401  }
   402  
   403  // send_RESTART_ETCD sends the restart operation.
   404  func (clus *Cluster) send_RESTART_ETCD() error {
   405  	return clus.broadcast(rpcpb.Operation_RESTART_ETCD)
   406  }
   407  
   408  func (clus *Cluster) broadcast(op rpcpb.Operation) error {
   409  	var wg sync.WaitGroup
   410  	wg.Add(len(clus.agentStreams))
   411  
   412  	errc := make(chan error, len(clus.agentStreams))
   413  	for i := range clus.agentStreams {
   414  		go func(idx int, o rpcpb.Operation) {
   415  			defer wg.Done()
   416  			errc <- clus.sendOp(idx, o)
   417  		}(i, op)
   418  	}
   419  	wg.Wait()
   420  	close(errc)
   421  
   422  	var errs []string
   423  	for err := range errc {
   424  		if err == nil {
   425  			continue
   426  		}
   427  
   428  		if err != nil {
   429  			destroyed := false
   430  			if op == rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT {
   431  				if err == io.EOF {
   432  					destroyed = true
   433  				}
   434  				if strings.Contains(err.Error(),
   435  					"rpc error: code = Unavailable desc = transport is closing") {
   436  					// agent server has already closed;
   437  					// so this error is expected
   438  					destroyed = true
   439  				}
   440  				if strings.Contains(err.Error(),
   441  					"desc = os: process already finished") {
   442  					destroyed = true
   443  				}
   444  			}
   445  			if !destroyed {
   446  				errs = append(errs, err.Error())
   447  			}
   448  		}
   449  	}
   450  
   451  	if len(errs) == 0 {
   452  		return nil
   453  	}
   454  	return errors.New(strings.Join(errs, ", "))
   455  }
   456  
   457  func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error {
   458  	_, err := clus.sendOpWithResp(idx, op)
   459  	return err
   460  }
   461  
   462  func (clus *Cluster) sendOpWithResp(idx int, op rpcpb.Operation) (*rpcpb.Response, error) {
   463  	// maintain the initial member object
   464  	// for the entire duration of the test
   465  	clus.agentRequests[idx] = &rpcpb.Request{
   466  		Operation: op,
   467  		Member:    clus.Members[idx],
   468  		Tester:    clus.Tester,
   469  	}
   470  
   471  	err := clus.agentStreams[idx].Send(clus.agentRequests[idx])
   472  	clus.lg.Info(
   473  		"sent request",
   474  		zap.String("operation", op.String()),
   475  		zap.String("to", clus.Members[idx].EtcdClientEndpoint),
   476  		zap.Error(err),
   477  	)
   478  	if err != nil {
   479  		return nil, err
   480  	}
   481  
   482  	resp, err := clus.agentStreams[idx].Recv()
   483  	if resp != nil {
   484  		clus.lg.Info(
   485  			"received response",
   486  			zap.String("operation", op.String()),
   487  			zap.String("from", clus.Members[idx].EtcdClientEndpoint),
   488  			zap.Bool("success", resp.Success),
   489  			zap.String("status", resp.Status),
   490  			zap.Error(err),
   491  		)
   492  	} else {
   493  		clus.lg.Info(
   494  			"received empty response",
   495  			zap.String("operation", op.String()),
   496  			zap.String("from", clus.Members[idx].EtcdClientEndpoint),
   497  			zap.Error(err),
   498  		)
   499  	}
   500  	if err != nil {
   501  		return nil, err
   502  	}
   503  
   504  	if !resp.Success {
   505  		return nil, errors.New(resp.Status)
   506  	}
   507  
   508  	m, secure := clus.Members[idx], false
   509  	for _, cu := range m.Etcd.AdvertiseClientURLs {
   510  		u, perr := url.Parse(cu)
   511  		if perr != nil {
   512  			return nil, perr
   513  		}
   514  		if u.Scheme == "https" { // TODO: handle unix
   515  			secure = true
   516  		}
   517  	}
   518  
   519  	// store TLS assets from agents/servers onto disk
   520  	if secure && (op == rpcpb.Operation_INITIAL_START_ETCD || op == rpcpb.Operation_RESTART_ETCD) {
   521  		dirClient := filepath.Join(
   522  			clus.Tester.DataDir,
   523  			clus.Members[idx].Etcd.Name,
   524  			"fixtures",
   525  			"client",
   526  		)
   527  		if err = fileutil.TouchDirAll(clus.lg, dirClient); err != nil {
   528  			return nil, err
   529  		}
   530  
   531  		clientCertData := []byte(resp.Member.ClientCertData)
   532  		if len(clientCertData) == 0 {
   533  			return nil, fmt.Errorf("got empty client cert from %q", m.EtcdClientEndpoint)
   534  		}
   535  		clientCertPath := filepath.Join(dirClient, "cert.pem")
   536  		if err = os.WriteFile(clientCertPath, clientCertData, 0644); err != nil { // overwrite if exists
   537  			return nil, err
   538  		}
   539  		resp.Member.ClientCertPath = clientCertPath
   540  		clus.lg.Info(
   541  			"saved client cert file",
   542  			zap.String("path", clientCertPath),
   543  		)
   544  
   545  		clientKeyData := []byte(resp.Member.ClientKeyData)
   546  		if len(clientKeyData) == 0 {
   547  			return nil, fmt.Errorf("got empty client key from %q", m.EtcdClientEndpoint)
   548  		}
   549  		clientKeyPath := filepath.Join(dirClient, "key.pem")
   550  		if err = os.WriteFile(clientKeyPath, clientKeyData, 0644); err != nil { // overwrite if exists
   551  			return nil, err
   552  		}
   553  		resp.Member.ClientKeyPath = clientKeyPath
   554  		clus.lg.Info(
   555  			"saved client key file",
   556  			zap.String("path", clientKeyPath),
   557  		)
   558  
   559  		clientTrustedCAData := []byte(resp.Member.ClientTrustedCAData)
   560  		if len(clientTrustedCAData) != 0 {
   561  			// TODO: disable this when auto TLS is deprecated
   562  			clientTrustedCAPath := filepath.Join(dirClient, "ca.pem")
   563  			if err = os.WriteFile(clientTrustedCAPath, clientTrustedCAData, 0644); err != nil { // overwrite if exists
   564  				return nil, err
   565  			}
   566  			resp.Member.ClientTrustedCAPath = clientTrustedCAPath
   567  			clus.lg.Info(
   568  				"saved client trusted CA file",
   569  				zap.String("path", clientTrustedCAPath),
   570  			)
   571  		}
   572  
   573  		// no need to store peer certs for tester clients
   574  
   575  		clus.Members[idx] = resp.Member
   576  	}
   577  
   578  	return resp, nil
   579  }
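
        // On a secure (re)start, the handling above leaves the member's TLS client
        // assets on the tester host under
        // <tester-data-dir>/<etcd-name>/fixtures/client/{cert.pem,key.pem}, plus
        // ca.pem when a trusted CA is returned, and replaces clus.Members[idx] with
        // the member object echoed back by the agent.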
   580  
   581  // Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT terminates all tester connections to agents and etcd servers.
   582  func (clus *Cluster) Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() {
   583  	err := clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT)
   584  	if err != nil {
   585  		clus.lg.Warn("destroying etcd/agents FAIL", zap.Error(err))
   586  	} else {
   587  		clus.lg.Info("destroying etcd/agents PASS")
   588  	}
   589  
   590  	for i, conn := range clus.agentConns {
   591  		err := conn.Close()
   592  		clus.lg.Info("closed connection to agent", zap.String("agent-address", clus.Members[i].AgentAddr), zap.Error(err))
   593  	}
   594  
   595  	if clus.testerHTTPServer != nil {
   596  		ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
   597  		err := clus.testerHTTPServer.Shutdown(ctx)
   598  		cancel()
   599  		clus.lg.Info("closed tester HTTP server", zap.String("tester-address", clus.Tester.Addr), zap.Error(err))
   600  	}
   601  }
   602  
   603  // WaitHealth ensures all members are healthy
   604  // by writing a test key to the etcd cluster.
   605  func (clus *Cluster) WaitHealth() error {
   606  	var err error
   607  	// wait up to 60s for the cluster to become healthy.
   608  	// TODO: set it to a reasonable value. It is set that high because
   609  	// a follower may take a long time to catch up with the leader when
   610  	// rebooting under a reasonable workload (https://github.com/etcd-io/etcd/issues/2698)
   611  	for i := 0; i < 60; i++ {
   612  		for _, m := range clus.Members {
   613  			if err = m.WriteHealthKey(); err != nil {
   614  				clus.lg.Warn(
   615  					"health check FAIL",
   616  					zap.Int("retries", i),
   617  					zap.String("endpoint", m.EtcdClientEndpoint),
   618  					zap.Error(err),
   619  				)
   620  				break
   621  			}
   622  			clus.lg.Info(
   623  				"health check PASS",
   624  				zap.Int("retries", i),
   625  				zap.String("endpoint", m.EtcdClientEndpoint),
   626  			)
   627  		}
   628  		if err == nil {
   629  			clus.lg.Info("health check ALL PASS")
   630  			return nil
   631  		}
   632  		time.Sleep(time.Second)
   633  	}
   634  	return err
   635  }
   636  
   637  // GetLeader returns the index of the leader and an error, if any.
   638  func (clus *Cluster) GetLeader() (int, error) {
   639  	for i, m := range clus.Members {
   640  		isLeader, err := m.IsLeader()
   641  		if isLeader || err != nil {
   642  			return i, err
   643  		}
   644  	}
   645  	return 0, fmt.Errorf("no leader found")
   646  }
   647  
   648  // maxRev returns the maximum revision found on the cluster.
   649  func (clus *Cluster) maxRev() (rev int64, err error) {
   650  	ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
   651  	defer cancel()
   652  	revc, errc := make(chan int64, len(clus.Members)), make(chan error, len(clus.Members))
   653  	for i := range clus.Members {
   654  		go func(m *rpcpb.Member) {
   655  			mrev, merr := m.Rev(ctx)
   656  			revc <- mrev
   657  			errc <- merr
   658  		}(clus.Members[i])
   659  	}
   660  	for i := 0; i < len(clus.Members); i++ {
   661  		if merr := <-errc; merr != nil {
   662  			err = merr
   663  		}
   664  		if mrev := <-revc; mrev > rev {
   665  			rev = mrev
   666  		}
   667  	}
   668  	return rev, err
   669  }
   670  
   671  func (clus *Cluster) getRevisionHash() (map[string]int64, map[string]int64, error) {
   672  	revs := make(map[string]int64)
   673  	hashes := make(map[string]int64)
   674  	for _, m := range clus.Members {
   675  		rev, hash, err := m.RevHash()
   676  		if err != nil {
   677  			return nil, nil, err
   678  		}
   679  		revs[m.EtcdClientEndpoint] = rev
   680  		hashes[m.EtcdClientEndpoint] = hash
   681  	}
   682  	return revs, hashes, nil
   683  }
   684  
   685  func (clus *Cluster) compactKV(rev int64, timeout time.Duration) (err error) {
   686  	if rev <= 0 {
   687  		return nil
   688  	}
   689  
   690  	for i, m := range clus.Members {
   691  		clus.lg.Info(
   692  			"compact START",
   693  			zap.String("endpoint", m.EtcdClientEndpoint),
   694  			zap.Int64("compact-revision", rev),
   695  			zap.Duration("timeout", timeout),
   696  		)
   697  		now := time.Now()
   698  		cerr := m.Compact(rev, timeout)
   699  		succeed := true
   700  		if cerr != nil {
   701  			if strings.Contains(cerr.Error(), "required revision has been compacted") && i > 0 {
   702  				clus.lg.Info(
   703  					"compact error is ignored",
   704  					zap.String("endpoint", m.EtcdClientEndpoint),
   705  					zap.Int64("compact-revision", rev),
   706  					zap.String("expected-error-msg", cerr.Error()),
   707  				)
   708  			} else {
   709  				clus.lg.Warn(
   710  					"compact FAIL",
   711  					zap.String("endpoint", m.EtcdClientEndpoint),
   712  					zap.Int64("compact-revision", rev),
   713  					zap.Error(cerr),
   714  				)
   715  				err = cerr
   716  				succeed = false
   717  			}
   718  		}
   719  
   720  		if succeed {
   721  			clus.lg.Info(
   722  				"compact PASS",
   723  				zap.String("endpoint", m.EtcdClientEndpoint),
   724  				zap.Int64("compact-revision", rev),
   725  				zap.Duration("timeout", timeout),
   726  				zap.Duration("took", time.Since(now)),
   727  			)
   728  		}
   729  	}
   730  	return err
   731  }
   732  
   733  func (clus *Cluster) checkCompact(rev int64) error {
   734  	if rev == 0 {
   735  		return nil
   736  	}
   737  	for _, m := range clus.Members {
   738  		if err := m.CheckCompact(rev); err != nil {
   739  			return err
   740  		}
   741  	}
   742  	return nil
   743  }
   744  
   745  func (clus *Cluster) defrag() error {
   746  	for _, m := range clus.Members {
   747  		if err := m.Defrag(); err != nil {
   748  			clus.lg.Warn(
   749  				"defrag FAIL",
   750  				zap.String("endpoint", m.EtcdClientEndpoint),
   751  				zap.Error(err),
   752  			)
   753  			return err
   754  		}
   755  		clus.lg.Info(
   756  			"defrag PASS",
   757  			zap.String("endpoint", m.EtcdClientEndpoint),
   758  		)
   759  	}
   760  	clus.lg.Info(
   761  		"defrag ALL PASS",
   762  		zap.Int("round", clus.rd),
   763  		zap.Int("case", clus.cs),
   764  		zap.Int("case-total", len(clus.cases)),
   765  	)
   766  	return nil
   767  }
   768  
   769  // GetCaseDelayDuration computes the delay duration between failure cases.
   770  func (clus *Cluster) GetCaseDelayDuration() time.Duration {
   771  	return time.Duration(clus.Tester.CaseDelayMs) * time.Millisecond
   772  }
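
        // For example, a hypothetical CaseDelayMs of 7000 in the tester configuration
        // yields a 7-second pause between failure cases.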
   773  
   774  // Report reports the number of modified keys.
   775  func (clus *Cluster) Report() int64 {
   776  	return clus.stresser.ModifiedKeys()
   777  }