go.etcd.io/etcd@v3.3.27+incompatible/functional/tester/cluster.go

     1  // Copyright 2018 The etcd Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tester
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  	"io/ioutil"
    23  	"math/rand"
    24  	"net/http"
    25  	"net/url"
    26  	"path/filepath"
    27  	"strings"
    28  	"sync"
    29  	"time"
    30  
    31  	"github.com/coreos/etcd/functional/rpcpb"
    32  	"github.com/coreos/etcd/pkg/debugutil"
    33  	"github.com/coreos/etcd/pkg/fileutil"
    34  
    35  	"github.com/prometheus/client_golang/prometheus/promhttp"
    36  	"go.uber.org/zap"
    37  	"golang.org/x/time/rate"
    38  	"google.golang.org/grpc"
    39  )
    40  
     41  // Cluster defines a tester cluster.
    42  type Cluster struct {
    43  	lg *zap.Logger
    44  
    45  	agentConns    []*grpc.ClientConn
    46  	agentClients  []rpcpb.TransportClient
    47  	agentStreams  []rpcpb.Transport_TransportClient
    48  	agentRequests []*rpcpb.Request
    49  
    50  	testerHTTPServer *http.Server
    51  
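         	// Members and Tester are populated from the tester YAML configuration,
         	// from its "agent-configs" and "tester-config" sections respectively.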
    52  	Members []*rpcpb.Member `yaml:"agent-configs"`
    53  	Tester  *rpcpb.Tester   `yaml:"tester-config"`
    54  
    55  	cases []Case
    56  
    57  	rateLimiter *rate.Limiter
    58  	stresser    Stresser
    59  	checkers    []Checker
    60  
    61  	currentRevision int64
    62  	rd              int
    63  	cs              int
    64  }
    65  
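         // dialOpts makes grpc.Dial block (grpc.WithBlock) until the connection to
         // the agent is up, bounded by a five-second dial timeout, and uses a
         // plaintext connection (grpc.WithInsecure).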
    66  var dialOpts = []grpc.DialOption{
    67  	grpc.WithInsecure(),
    68  	grpc.WithTimeout(5 * time.Second),
    69  	grpc.WithBlock(),
    70  }
    71  
     72  // NewCluster creates a tester cluster from the tester configuration file at fpath.
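         //
         // A minimal usage sketch (assuming a tester configuration file named
         // "tester.yaml"; the file name is illustrative only):
         //
         //	lg, _ := zap.NewProduction()
         //	clus, err := NewCluster(lg, "tester.yaml")
         //	if err != nil {
         //		lg.Fatal("failed to create tester cluster", zap.Error(err))
         //	}
         //	defer clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()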
    73  func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
    74  	clus, err := read(lg, fpath)
    75  	if err != nil {
    76  		return nil, err
    77  	}
    78  
    79  	clus.agentConns = make([]*grpc.ClientConn, len(clus.Members))
    80  	clus.agentClients = make([]rpcpb.TransportClient, len(clus.Members))
    81  	clus.agentStreams = make([]rpcpb.Transport_TransportClient, len(clus.Members))
    82  	clus.agentRequests = make([]*rpcpb.Request, len(clus.Members))
    83  	clus.cases = make([]Case, 0)
    84  
    85  	for i, ap := range clus.Members {
    86  		var err error
    87  		clus.agentConns[i], err = grpc.Dial(ap.AgentAddr, dialOpts...)
    88  		if err != nil {
    89  			return nil, err
    90  		}
    91  		clus.agentClients[i] = rpcpb.NewTransportClient(clus.agentConns[i])
    92  		clus.lg.Info("connected", zap.String("agent-address", ap.AgentAddr))
    93  
    94  		clus.agentStreams[i], err = clus.agentClients[i].Transport(context.Background())
    95  		if err != nil {
    96  			return nil, err
    97  		}
    98  		clus.lg.Info("created stream", zap.String("agent-address", ap.AgentAddr))
    99  	}
   100  
   101  	mux := http.NewServeMux()
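         	// expose Prometheus metrics (and pprof handlers, when enabled) over the
         	// tester HTTP server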
   102  	mux.Handle("/metrics", promhttp.Handler())
   103  	if clus.Tester.EnablePprof {
   104  		for p, h := range debugutil.PProfHandlers() {
   105  			mux.Handle(p, h)
   106  		}
   107  	}
   108  	clus.testerHTTPServer = &http.Server{
   109  		Addr:    clus.Tester.Addr,
   110  		Handler: mux,
   111  	}
   112  	go clus.serveTesterServer()
   113  
   114  	clus.updateCases()
   115  
   116  	clus.rateLimiter = rate.NewLimiter(
   117  		rate.Limit(int(clus.Tester.StressQPS)),
   118  		int(clus.Tester.StressQPS),
   119  	)
   120  
   121  	clus.setStresserChecker()
   122  
   123  	return clus, nil
   124  }
   125  
   126  // EtcdClientEndpoints returns all etcd client endpoints.
   127  func (clus *Cluster) EtcdClientEndpoints() (css []string) {
   128  	css = make([]string, len(clus.Members))
   129  	for i := range clus.Members {
   130  		css[i] = clus.Members[i].EtcdClientEndpoint
   131  	}
   132  	return css
   133  }
   134  
   135  func (clus *Cluster) serveTesterServer() {
   136  	clus.lg.Info(
   137  		"started tester HTTP server",
   138  		zap.String("tester-address", clus.Tester.Addr),
   139  	)
   140  	err := clus.testerHTTPServer.ListenAndServe()
   141  	clus.lg.Info(
   142  		"tester HTTP server returned",
   143  		zap.String("tester-address", clus.Tester.Addr),
   144  		zap.Error(err),
   145  	)
   146  	if err != nil && err != http.ErrServerClosed {
    147  		clus.lg.Fatal("tester HTTP server errored", zap.Error(err))
   148  	}
   149  }
   150  
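         // updateCases maps the case names listed in Tester.Cases onto Case
         // implementations; the "RANDOM_" delay variants call the same constructors
         // with their random flag enabled.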
   151  func (clus *Cluster) updateCases() {
   152  	for _, cs := range clus.Tester.Cases {
   153  		switch cs {
   154  		case "SIGTERM_ONE_FOLLOWER":
   155  			clus.cases = append(clus.cases,
   156  				new_Case_SIGTERM_ONE_FOLLOWER(clus))
   157  		case "SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
   158  			clus.cases = append(clus.cases,
   159  				new_Case_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus))
   160  		case "SIGTERM_LEADER":
   161  			clus.cases = append(clus.cases,
   162  				new_Case_SIGTERM_LEADER(clus))
   163  		case "SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT":
   164  			clus.cases = append(clus.cases,
   165  				new_Case_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus))
   166  		case "SIGTERM_QUORUM":
   167  			clus.cases = append(clus.cases,
   168  				new_Case_SIGTERM_QUORUM(clus))
   169  		case "SIGTERM_ALL":
   170  			clus.cases = append(clus.cases,
   171  				new_Case_SIGTERM_ALL(clus))
   172  
   173  		case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER":
   174  			clus.cases = append(clus.cases,
   175  				new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus))
   176  		case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
   177  			clus.cases = append(clus.cases,
   178  				new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus))
   179  		case "SIGQUIT_AND_REMOVE_LEADER":
   180  			clus.cases = append(clus.cases,
   181  				new_Case_SIGQUIT_AND_REMOVE_LEADER(clus))
   182  		case "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT":
   183  			clus.cases = append(clus.cases,
   184  				new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus))
   185  		case "SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH":
   186  			clus.cases = append(clus.cases,
   187  				new_Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH(clus))
   188  
   189  		case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER":
   190  			clus.cases = append(clus.cases,
   191  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus))
   192  		case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
   193  			clus.cases = append(clus.cases,
   194  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT())
   195  		case "BLACKHOLE_PEER_PORT_TX_RX_LEADER":
   196  			clus.cases = append(clus.cases,
   197  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER(clus))
   198  		case "BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
   199  			clus.cases = append(clus.cases,
   200  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT())
   201  		case "BLACKHOLE_PEER_PORT_TX_RX_QUORUM":
   202  			clus.cases = append(clus.cases,
   203  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_QUORUM(clus))
   204  		case "BLACKHOLE_PEER_PORT_TX_RX_ALL":
   205  			clus.cases = append(clus.cases,
   206  				new_Case_BLACKHOLE_PEER_PORT_TX_RX_ALL(clus))
   207  
   208  		case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
   209  			clus.cases = append(clus.cases,
   210  				new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, false))
   211  		case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
   212  			clus.cases = append(clus.cases,
   213  				new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, true))
   214  		case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
   215  			clus.cases = append(clus.cases,
   216  				new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, false))
   217  		case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
   218  			clus.cases = append(clus.cases,
   219  				new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, true))
   220  		case "DELAY_PEER_PORT_TX_RX_LEADER":
   221  			clus.cases = append(clus.cases,
   222  				new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus, false))
   223  		case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER":
   224  			clus.cases = append(clus.cases,
   225  				new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus, true))
   226  		case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
   227  			clus.cases = append(clus.cases,
   228  				new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, false))
   229  		case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
   230  			clus.cases = append(clus.cases,
   231  				new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, true))
   232  		case "DELAY_PEER_PORT_TX_RX_QUORUM":
   233  			clus.cases = append(clus.cases,
   234  				new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus, false))
   235  		case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM":
   236  			clus.cases = append(clus.cases,
   237  				new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus, true))
   238  		case "DELAY_PEER_PORT_TX_RX_ALL":
   239  			clus.cases = append(clus.cases,
   240  				new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus, false))
   241  		case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL":
   242  			clus.cases = append(clus.cases,
   243  				new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus, true))
   244  
   245  		case "NO_FAIL_WITH_STRESS":
   246  			clus.cases = append(clus.cases,
   247  				new_Case_NO_FAIL_WITH_STRESS(clus))
   248  		case "NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS":
   249  			clus.cases = append(clus.cases,
   250  				new_Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS(clus))
   251  
   252  		case "EXTERNAL":
   253  			clus.cases = append(clus.cases,
   254  				new_Case_EXTERNAL(clus.Tester.ExternalExecPath))
   255  		case "FAILPOINTS":
   256  			fpFailures, fperr := failpointFailures(clus)
   257  			if len(fpFailures) == 0 {
   258  				clus.lg.Info("no failpoints found!", zap.Error(fperr))
   259  			}
   260  			clus.cases = append(clus.cases,
   261  				fpFailures...)
   262  		}
   263  	}
   264  }
   265  
   266  func (clus *Cluster) listCases() (css []string) {
   267  	css = make([]string, len(clus.cases))
   268  	for i := range clus.cases {
   269  		css[i] = clus.cases[i].Desc()
   270  	}
   271  	return css
   272  }
   273  
    274  // UpdateDelayLatencyMs updates the delay latency to a random value
    275  // within the election timeout.
   276  func (clus *Cluster) UpdateDelayLatencyMs() {
   277  	rand.Seed(time.Now().UnixNano())
   278  	clus.Tester.UpdatedDelayLatencyMs = uint32(rand.Int63n(clus.Members[0].Etcd.ElectionTimeoutMs))
   279  
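         	// keep the updated latency above DelayLatencyMsRv plus a 20% margin;
         	// otherwise bump it up by that bound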
   280  	minLatRv := clus.Tester.DelayLatencyMsRv + clus.Tester.DelayLatencyMsRv/5
   281  	if clus.Tester.UpdatedDelayLatencyMs <= minLatRv {
   282  		clus.Tester.UpdatedDelayLatencyMs += minLatRv
   283  	}
   284  }
   285  
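         // setStresserChecker builds one composite stresser per member and registers
         // the checkers configured in Tester.Checkers (KV_HASH, LEASE_EXPIRE, RUNNER,
         // NO_CHECK).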
   286  func (clus *Cluster) setStresserChecker() {
   287  	css := &compositeStresser{}
   288  	lss := []*leaseStresser{}
   289  	rss := []*runnerStresser{}
   290  	for _, m := range clus.Members {
   291  		sss := newStresser(clus, m)
   292  		css.stressers = append(css.stressers, &compositeStresser{sss})
   293  		for _, s := range sss {
   294  			if v, ok := s.(*leaseStresser); ok {
   295  				lss = append(lss, v)
   296  				clus.lg.Info("added lease stresser", zap.String("endpoint", m.EtcdClientEndpoint))
   297  			}
   298  			if v, ok := s.(*runnerStresser); ok {
   299  				rss = append(rss, v)
    300  				clus.lg.Info("added runner stresser", zap.String("endpoint", m.EtcdClientEndpoint))
   301  			}
   302  		}
   303  	}
   304  	clus.stresser = css
   305  
   306  	for _, cs := range clus.Tester.Checkers {
   307  		switch cs {
   308  		case "KV_HASH":
   309  			clus.checkers = append(clus.checkers, newKVHashChecker(clus))
   310  
   311  		case "LEASE_EXPIRE":
   312  			for _, ls := range lss {
   313  				clus.checkers = append(clus.checkers, newLeaseExpireChecker(ls))
   314  			}
   315  
   316  		case "RUNNER":
   317  			for _, rs := range rss {
   318  				clus.checkers = append(clus.checkers, newRunnerChecker(rs.etcdClientEndpoint, rs.errc))
   319  			}
   320  
   321  		case "NO_CHECK":
   322  			clus.checkers = append(clus.checkers, newNoChecker())
   323  		}
   324  	}
   325  	clus.lg.Info("updated stressers")
   326  }
   327  
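         // runCheckers runs every configured checker, tolerating failures only for
         // the checker types listed in exceptions; when all checks pass, it also
         // calls updateRevision.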
   328  func (clus *Cluster) runCheckers(exceptions ...rpcpb.Checker) (err error) {
   329  	defer func() {
   330  		if err != nil {
   331  			return
   332  		}
   333  		if err = clus.updateRevision(); err != nil {
   334  			clus.lg.Warn(
   335  				"updateRevision failed",
   336  				zap.Error(err),
   337  			)
   338  			return
   339  		}
   340  	}()
   341  
   342  	exs := make(map[rpcpb.Checker]struct{})
   343  	for _, e := range exceptions {
   344  		exs[e] = struct{}{}
   345  	}
   346  	for _, chk := range clus.checkers {
   347  		clus.lg.Warn(
   348  			"consistency check START",
   349  			zap.String("checker", chk.Type().String()),
   350  			zap.Strings("client-endpoints", chk.EtcdClientEndpoints()),
   351  		)
   352  		err = chk.Check()
   353  		clus.lg.Warn(
   354  			"consistency check END",
   355  			zap.String("checker", chk.Type().String()),
   356  			zap.Strings("client-endpoints", chk.EtcdClientEndpoints()),
   357  			zap.Error(err),
   358  		)
   359  		if err != nil {
   360  			_, ok := exs[chk.Type()]
   361  			if !ok {
   362  				return err
   363  			}
   364  			clus.lg.Warn(
   365  				"consistency check SKIP FAIL",
   366  				zap.String("checker", chk.Type().String()),
   367  				zap.Strings("client-endpoints", chk.EtcdClientEndpoints()),
   368  				zap.Error(err),
   369  			)
   370  		}
   371  	}
   372  	return nil
   373  }
   374  
    375  // Send_INITIAL_START_ETCD bootstraps the etcd cluster for the very first time.
   376  // After this, just continue to call kill/restart.
   377  func (clus *Cluster) Send_INITIAL_START_ETCD() error {
    378  	// this is the only time a request is created from scratch
   379  	return clus.broadcast(rpcpb.Operation_INITIAL_START_ETCD)
   380  }
   381  
    382  // send_SIGQUIT_ETCD_AND_ARCHIVE_DATA sends the "SIGQUIT_ETCD_AND_ARCHIVE_DATA" operation.
   383  func (clus *Cluster) send_SIGQUIT_ETCD_AND_ARCHIVE_DATA() error {
   384  	return clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA)
   385  }
   386  
    387  // send_RESTART_ETCD sends the restart operation.
   388  func (clus *Cluster) send_RESTART_ETCD() error {
   389  	return clus.broadcast(rpcpb.Operation_RESTART_ETCD)
   390  }
   391  
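         // broadcast sends op to every agent stream concurrently and joins the
         // per-agent errors into one error. When stopping agents
         // (SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT), errors that merely indicate
         // the agent has already shut down are ignored.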
   392  func (clus *Cluster) broadcast(op rpcpb.Operation) error {
   393  	var wg sync.WaitGroup
   394  	wg.Add(len(clus.agentStreams))
   395  
   396  	errc := make(chan error, len(clus.agentStreams))
   397  	for i := range clus.agentStreams {
   398  		go func(idx int, o rpcpb.Operation) {
   399  			defer wg.Done()
   400  			errc <- clus.sendOp(idx, o)
   401  		}(i, op)
   402  	}
   403  	wg.Wait()
   404  	close(errc)
   405  
   406  	errs := []string{}
   407  	for err := range errc {
   408  		if err == nil {
   409  			continue
   410  		}
   411  
   412  		if err != nil {
   413  			destroyed := false
   414  			if op == rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT {
   415  				if err == io.EOF {
   416  					destroyed = true
   417  				}
   418  				if strings.Contains(err.Error(),
   419  					"rpc error: code = Unavailable desc = transport is closing") {
   420  					// agent server has already closed;
   421  					// so this error is expected
   422  					destroyed = true
   423  				}
   424  				if strings.Contains(err.Error(),
   425  					"desc = os: process already finished") {
   426  					destroyed = true
   427  				}
   428  			}
   429  			if !destroyed {
   430  				errs = append(errs, err.Error())
   431  			}
   432  		}
   433  	}
   434  
   435  	if len(errs) == 0 {
   436  		return nil
   437  	}
   438  	return errors.New(strings.Join(errs, ", "))
   439  }
   440  
   441  func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error {
   442  	_, err := clus.sendOpWithResp(idx, op)
   443  	return err
   444  }
   445  
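         // sendOpWithResp sends op to the agent at index idx and waits for its
         // response. For members with HTTPS client URLs, a start or restart response
         // also carries TLS assets, which are written under Tester.DataDir so tester
         // clients can connect securely.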
   446  func (clus *Cluster) sendOpWithResp(idx int, op rpcpb.Operation) (*rpcpb.Response, error) {
    447  	// maintain the initial member object
    448  	// for the duration of the test
   449  	clus.agentRequests[idx] = &rpcpb.Request{
   450  		Operation: op,
   451  		Member:    clus.Members[idx],
   452  		Tester:    clus.Tester,
   453  	}
   454  
   455  	err := clus.agentStreams[idx].Send(clus.agentRequests[idx])
   456  	clus.lg.Info(
   457  		"sent request",
   458  		zap.String("operation", op.String()),
   459  		zap.String("to", clus.Members[idx].EtcdClientEndpoint),
   460  		zap.Error(err),
   461  	)
   462  	if err != nil {
   463  		return nil, err
   464  	}
   465  
   466  	resp, err := clus.agentStreams[idx].Recv()
   467  	if resp != nil {
   468  		clus.lg.Info(
   469  			"received response",
   470  			zap.String("operation", op.String()),
   471  			zap.String("from", clus.Members[idx].EtcdClientEndpoint),
   472  			zap.Bool("success", resp.Success),
   473  			zap.String("status", resp.Status),
   474  			zap.Error(err),
   475  		)
   476  	} else {
   477  		clus.lg.Info(
   478  			"received empty response",
   479  			zap.String("operation", op.String()),
   480  			zap.String("from", clus.Members[idx].EtcdClientEndpoint),
   481  			zap.Error(err),
   482  		)
   483  	}
   484  	if err != nil {
   485  		return nil, err
   486  	}
   487  
   488  	if !resp.Success {
   489  		return nil, errors.New(resp.Status)
   490  	}
   491  
   492  	m, secure := clus.Members[idx], false
   493  	for _, cu := range m.Etcd.AdvertiseClientURLs {
   494  		u, err := url.Parse(cu)
   495  		if err != nil {
   496  			return nil, err
   497  		}
   498  		if u.Scheme == "https" { // TODO: handle unix
   499  			secure = true
   500  		}
   501  	}
   502  
   503  	// store TLS assets from agents/servers onto disk
   504  	if secure && (op == rpcpb.Operation_INITIAL_START_ETCD || op == rpcpb.Operation_RESTART_ETCD) {
   505  		dirClient := filepath.Join(
   506  			clus.Tester.DataDir,
   507  			clus.Members[idx].Etcd.Name,
   508  			"fixtures",
   509  			"client",
   510  		)
   511  		if err = fileutil.TouchDirAll(dirClient); err != nil {
   512  			return nil, err
   513  		}
   514  
   515  		clientCertData := []byte(resp.Member.ClientCertData)
   516  		if len(clientCertData) == 0 {
   517  			return nil, fmt.Errorf("got empty client cert from %q", m.EtcdClientEndpoint)
   518  		}
   519  		clientCertPath := filepath.Join(dirClient, "cert.pem")
   520  		if err = ioutil.WriteFile(clientCertPath, clientCertData, 0644); err != nil { // overwrite if exists
   521  			return nil, err
   522  		}
   523  		resp.Member.ClientCertPath = clientCertPath
   524  		clus.lg.Info(
   525  			"saved client cert file",
   526  			zap.String("path", clientCertPath),
   527  		)
   528  
   529  		clientKeyData := []byte(resp.Member.ClientKeyData)
   530  		if len(clientKeyData) == 0 {
   531  			return nil, fmt.Errorf("got empty client key from %q", m.EtcdClientEndpoint)
   532  		}
   533  		clientKeyPath := filepath.Join(dirClient, "key.pem")
   534  		if err = ioutil.WriteFile(clientKeyPath, clientKeyData, 0644); err != nil { // overwrite if exists
   535  			return nil, err
   536  		}
   537  		resp.Member.ClientKeyPath = clientKeyPath
   538  		clus.lg.Info(
   539  			"saved client key file",
   540  			zap.String("path", clientKeyPath),
   541  		)
   542  
   543  		clientTrustedCAData := []byte(resp.Member.ClientTrustedCAData)
   544  		if len(clientTrustedCAData) != 0 {
   545  			// TODO: disable this when auto TLS is deprecated
   546  			clientTrustedCAPath := filepath.Join(dirClient, "ca.pem")
   547  			if err = ioutil.WriteFile(clientTrustedCAPath, clientTrustedCAData, 0644); err != nil { // overwrite if exists
   548  				return nil, err
   549  			}
   550  			resp.Member.ClientTrustedCAPath = clientTrustedCAPath
   551  			clus.lg.Info(
   552  				"saved client trusted CA file",
   553  				zap.String("path", clientTrustedCAPath),
   554  			)
   555  		}
   556  
   557  		// no need to store peer certs for tester clients
   558  
   559  		clus.Members[idx] = resp.Member
   560  	}
   561  
   562  	return resp, nil
   563  }
   564  
   565  // Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT terminates all tester connections to agents and etcd servers.
   566  func (clus *Cluster) Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() {
   567  	err := clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT)
   568  	if err != nil {
   569  		clus.lg.Warn("destroying etcd/agents FAIL", zap.Error(err))
   570  	} else {
   571  		clus.lg.Info("destroying etcd/agents PASS")
   572  	}
   573  
   574  	for i, conn := range clus.agentConns {
   575  		err := conn.Close()
   576  		clus.lg.Info("closed connection to agent", zap.String("agent-address", clus.Members[i].AgentAddr), zap.Error(err))
   577  	}
   578  
   579  	if clus.testerHTTPServer != nil {
   580  		ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
   581  		err := clus.testerHTTPServer.Shutdown(ctx)
   582  		cancel()
   583  		clus.lg.Info("closed tester HTTP server", zap.String("tester-address", clus.Tester.Addr), zap.Error(err))
   584  	}
   585  }
   586  
    587  // WaitHealth ensures all members are healthy
    588  // by writing a test key to the etcd cluster.
   589  func (clus *Cluster) WaitHealth() error {
   590  	var err error
   591  	// wait 60s to check cluster health.
    592  	// TODO: set it to a reasonable value. It is set this high because a
    593  	// follower may take a long time to catch up with the leader after a
    594  	// reboot under a reasonable workload (https://github.com/coreos/etcd/issues/2698).
   595  	for i := 0; i < 60; i++ {
   596  		for _, m := range clus.Members {
   597  			if err = m.WriteHealthKey(); err != nil {
   598  				clus.lg.Warn(
   599  					"health check FAIL",
   600  					zap.Int("retries", i),
   601  					zap.String("endpoint", m.EtcdClientEndpoint),
   602  					zap.Error(err),
   603  				)
   604  				break
   605  			}
   606  			clus.lg.Info(
   607  				"health check PASS",
   608  				zap.Int("retries", i),
   609  				zap.String("endpoint", m.EtcdClientEndpoint),
   610  			)
   611  		}
   612  		if err == nil {
   613  			clus.lg.Info("health check ALL PASS")
   614  			return nil
   615  		}
   616  		time.Sleep(time.Second)
   617  	}
   618  	return err
   619  }
   620  
    621  // GetLeader returns the index of the leader and an error, if any.
   622  func (clus *Cluster) GetLeader() (int, error) {
   623  	for i, m := range clus.Members {
   624  		isLeader, err := m.IsLeader()
   625  		if isLeader || err != nil {
   626  			return i, err
   627  		}
   628  	}
   629  	return 0, fmt.Errorf("no leader found")
   630  }
   631  
   632  // maxRev returns the maximum revision found on the cluster.
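         // Revisions are queried from all members concurrently with a one-second
         // timeout; the highest revision is returned together with the last member
         // error observed, if any.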
   633  func (clus *Cluster) maxRev() (rev int64, err error) {
   634  	ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
   635  	defer cancel()
   636  	revc, errc := make(chan int64, len(clus.Members)), make(chan error, len(clus.Members))
   637  	for i := range clus.Members {
   638  		go func(m *rpcpb.Member) {
   639  			mrev, merr := m.Rev(ctx)
   640  			revc <- mrev
   641  			errc <- merr
   642  		}(clus.Members[i])
   643  	}
   644  	for i := 0; i < len(clus.Members); i++ {
   645  		if merr := <-errc; merr != nil {
   646  			err = merr
   647  		}
   648  		if mrev := <-revc; mrev > rev {
   649  			rev = mrev
   650  		}
   651  	}
   652  	return rev, err
   653  }
   654  
   655  func (clus *Cluster) getRevisionHash() (map[string]int64, map[string]int64, error) {
   656  	revs := make(map[string]int64)
   657  	hashes := make(map[string]int64)
   658  	for _, m := range clus.Members {
   659  		rev, hash, err := m.RevHash()
   660  		if err != nil {
   661  			return nil, nil, err
   662  		}
   663  		revs[m.EtcdClientEndpoint] = rev
   664  		hashes[m.EtcdClientEndpoint] = hash
   665  	}
   666  	return revs, hashes, nil
   667  }
   668  
   669  func (clus *Cluster) compactKV(rev int64, timeout time.Duration) (err error) {
   670  	if rev <= 0 {
   671  		return nil
   672  	}
   673  
   674  	for i, m := range clus.Members {
   675  		clus.lg.Info(
   676  			"compact START",
   677  			zap.String("endpoint", m.EtcdClientEndpoint),
   678  			zap.Int64("compact-revision", rev),
   679  			zap.Duration("timeout", timeout),
   680  		)
   681  		now := time.Now()
   682  		cerr := m.Compact(rev, timeout)
   683  		succeed := true
   684  		if cerr != nil {
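         			// members after the first may have already applied the compaction
         			// replicated from an earlier member, so this error is expected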
   685  			if strings.Contains(cerr.Error(), "required revision has been compacted") && i > 0 {
   686  				clus.lg.Info(
   687  					"compact error is ignored",
   688  					zap.String("endpoint", m.EtcdClientEndpoint),
   689  					zap.Int64("compact-revision", rev),
   690  					zap.Error(cerr),
   691  				)
   692  			} else {
   693  				clus.lg.Warn(
   694  					"compact FAIL",
   695  					zap.String("endpoint", m.EtcdClientEndpoint),
   696  					zap.Int64("compact-revision", rev),
   697  					zap.Error(cerr),
   698  				)
   699  				err = cerr
   700  				succeed = false
   701  			}
   702  		}
   703  
   704  		if succeed {
   705  			clus.lg.Info(
   706  				"compact PASS",
   707  				zap.String("endpoint", m.EtcdClientEndpoint),
   708  				zap.Int64("compact-revision", rev),
   709  				zap.Duration("timeout", timeout),
   710  				zap.Duration("took", time.Since(now)),
   711  			)
   712  		}
   713  	}
   714  	return err
   715  }
   716  
   717  func (clus *Cluster) checkCompact(rev int64) error {
   718  	if rev == 0 {
   719  		return nil
   720  	}
   721  	for _, m := range clus.Members {
   722  		if err := m.CheckCompact(rev); err != nil {
   723  			return err
   724  		}
   725  	}
   726  	return nil
   727  }
   728  
   729  func (clus *Cluster) defrag() error {
   730  	for _, m := range clus.Members {
   731  		if err := m.Defrag(); err != nil {
   732  			clus.lg.Warn(
   733  				"defrag FAIL",
   734  				zap.String("endpoint", m.EtcdClientEndpoint),
   735  				zap.Error(err),
   736  			)
   737  			return err
   738  		}
   739  		clus.lg.Info(
   740  			"defrag PASS",
   741  			zap.String("endpoint", m.EtcdClientEndpoint),
   742  		)
   743  	}
   744  	clus.lg.Info(
   745  		"defrag ALL PASS",
   746  		zap.Int("round", clus.rd),
   747  		zap.Int("case", clus.cs),
   748  		zap.Int("case-total", len(clus.cases)),
   749  	)
   750  	return nil
   751  }
   752  
    753  // GetCaseDelayDuration returns the configured delay between cases as a time.Duration.
   754  func (clus *Cluster) GetCaseDelayDuration() time.Duration {
   755  	return time.Duration(clus.Tester.CaseDelayMs) * time.Millisecond
   756  }
   757  
   758  // Report reports the number of modified keys.
   759  func (clus *Cluster) Report() int64 {
   760  	return clus.stresser.ModifiedKeys()
   761  }