go.etcd.io/etcd@v3.3.27+incompatible/functional/tester/cluster_run.go (about)

     1  // Copyright 2018 The etcd Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tester
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"time"
    21  
    22  	"github.com/coreos/etcd/functional/rpcpb"
    23  	"github.com/coreos/etcd/pkg/fileutil"
    24  
    25  	"go.uber.org/zap"
    26  )
    27  
    28  // compactQPS is rough number of compact requests per second.
    29  // Previous tests showed etcd can compact about 60,000 entries per second.
    30  const compactQPS = 50000
    31  
    32  // Run starts tester.
    33  func (clus *Cluster) Run() {
    34  	defer printReport()
    35  
    36  	if err := fileutil.TouchDirAll(clus.Tester.DataDir); err != nil {
    37  		clus.lg.Panic(
    38  			"failed to create test data directory",
    39  			zap.String("dir", clus.Tester.DataDir),
    40  			zap.Error(err),
    41  		)
    42  	}
    43  
    44  	var preModifiedKey int64
    45  	for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ {
    46  		roundTotalCounter.Inc()
    47  		clus.rd = round
    48  
    49  		if err := clus.doRound(); err != nil {
    50  			clus.lg.Warn(
    51  				"round FAIL",
    52  				zap.Int("round", clus.rd),
    53  				zap.Int("case", clus.cs),
    54  				zap.Int("case-total", len(clus.cases)),
    55  				zap.Error(err),
    56  			)
    57  			if clus.cleanup() != nil {
    58  				return
    59  			}
    60  			// reset preModifiedKey after clean up
    61  			preModifiedKey = 0
    62  			continue
    63  		}
    64  
    65  		// -1 so that logPrefix doesn't print out 'case'
    66  		clus.cs = -1
    67  
    68  		revToCompact := max(0, clus.currentRevision-10000)
    69  		currentModifiedKey := clus.stresser.ModifiedKeys()
    70  		modifiedKey := currentModifiedKey - preModifiedKey
    71  		preModifiedKey = currentModifiedKey
    72  		timeout := 10 * time.Second
    73  		timeout += time.Duration(modifiedKey/compactQPS) * time.Second
    74  		clus.lg.Info(
    75  			"compact START",
    76  			zap.Int("round", clus.rd),
    77  			zap.Int("case", clus.cs),
    78  			zap.Int("case-total", len(clus.cases)),
    79  			zap.Duration("timeout", timeout),
    80  		)
    81  		if err := clus.compact(revToCompact, timeout); err != nil {
    82  			clus.lg.Warn(
    83  				"compact FAIL",
    84  				zap.Int("round", clus.rd),
    85  				zap.Int("case", clus.cs),
    86  				zap.Int("case-total", len(clus.cases)),
    87  				zap.Error(err),
    88  			)
    89  			if err = clus.cleanup(); err != nil {
    90  				clus.lg.Warn(
    91  					"cleanup FAIL",
    92  					zap.Int("round", clus.rd),
    93  					zap.Int("case", clus.cs),
    94  					zap.Int("case-total", len(clus.cases)),
    95  					zap.Error(err),
    96  				)
    97  				return
    98  			}
    99  			// reset preModifiedKey after clean up
   100  			preModifiedKey = 0
   101  		}
   102  		if round > 0 && round%500 == 0 { // every 500 rounds
   103  			if err := clus.defrag(); err != nil {
   104  				clus.failed()
   105  				return
   106  			}
   107  		}
   108  	}
   109  
   110  	clus.lg.Info(
   111  		"functional-tester PASS",
   112  		zap.Int("round", clus.rd),
   113  		zap.Int("case", clus.cs),
   114  		zap.Int("case-total", len(clus.cases)),
   115  	)
   116  }
   117  
   118  func (clus *Cluster) doRound() error {
   119  	if clus.Tester.CaseShuffle {
   120  		clus.shuffleCases()
   121  	}
   122  
   123  	roundNow := time.Now()
   124  	clus.lg.Info(
   125  		"round START",
   126  		zap.Int("round", clus.rd),
   127  		zap.Int("case", clus.cs),
   128  		zap.Int("case-total", len(clus.cases)),
   129  		zap.Strings("cases", clus.listCases()),
   130  	)
   131  	for i, fa := range clus.cases {
   132  		clus.cs = i
   133  
   134  		caseTotal[fa.Desc()]++
   135  		caseTotalCounter.WithLabelValues(fa.Desc()).Inc()
   136  
   137  		caseNow := time.Now()
   138  		clus.lg.Info(
   139  			"case START",
   140  			zap.Int("round", clus.rd),
   141  			zap.Int("case", clus.cs),
   142  			zap.Int("case-total", len(clus.cases)),
   143  			zap.String("desc", fa.Desc()),
   144  		)
   145  
   146  		clus.lg.Info("wait health before injecting failures")
   147  		if err := clus.WaitHealth(); err != nil {
   148  			return fmt.Errorf("wait full health error: %v", err)
   149  		}
   150  
   151  		stressStarted := false
   152  		fcase := fa.TestCase()
   153  		if fcase != rpcpb.Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
   154  			clus.lg.Info(
   155  				"stress START",
   156  				zap.Int("round", clus.rd),
   157  				zap.Int("case", clus.cs),
   158  				zap.Int("case-total", len(clus.cases)),
   159  				zap.String("desc", fa.Desc()),
   160  			)
   161  			if err := clus.stresser.Stress(); err != nil {
   162  				return fmt.Errorf("start stresser error: %v", err)
   163  			}
   164  			stressStarted = true
   165  		}
   166  
   167  		clus.lg.Info(
   168  			"inject START",
   169  			zap.Int("round", clus.rd),
   170  			zap.Int("case", clus.cs),
   171  			zap.Int("case-total", len(clus.cases)),
   172  			zap.String("desc", fa.Desc()),
   173  		)
   174  		if err := fa.Inject(clus); err != nil {
   175  			return fmt.Errorf("injection error: %v", err)
   176  		}
   177  
   178  		// if run local, recovering server may conflict
   179  		// with stressing client ports
   180  		// TODO: use unix for local tests
   181  		clus.lg.Info(
   182  			"recover START",
   183  			zap.Int("round", clus.rd),
   184  			zap.Int("case", clus.cs),
   185  			zap.Int("case-total", len(clus.cases)),
   186  			zap.String("desc", fa.Desc()),
   187  		)
   188  		if err := fa.Recover(clus); err != nil {
   189  			return fmt.Errorf("recovery error: %v", err)
   190  		}
   191  
   192  		if stressStarted {
   193  			clus.lg.Info(
   194  				"stress PAUSE",
   195  				zap.Int("round", clus.rd),
   196  				zap.Int("case", clus.cs),
   197  				zap.Int("case-total", len(clus.cases)),
   198  				zap.String("desc", fa.Desc()),
   199  			)
   200  			ems := clus.stresser.Pause()
   201  			if fcase == rpcpb.Case_NO_FAIL_WITH_STRESS && len(ems) > 0 {
   202  				ess := make([]string, 0, len(ems))
   203  				cnt := 0
   204  				for k, v := range ems {
   205  					ess = append(ess, fmt.Sprintf("%s (count: %d)", k, v))
   206  					cnt += v
   207  				}
   208  				clus.lg.Warn(
   209  					"expected no errors",
   210  					zap.String("desc", fa.Desc()),
   211  					zap.Strings("errors", ess),
   212  				)
   213  
   214  				// with network delay, some ongoing requests may fail
   215  				// only return error, if more than 10% of QPS requests fail
   216  				if cnt > int(clus.Tester.StressQPS)/10 {
   217  					return fmt.Errorf("expected no error in %q, got %q", fcase.String(), ess)
   218  				}
   219  			}
   220  		}
   221  
   222  		clus.lg.Info(
   223  			"health check START",
   224  			zap.Int("round", clus.rd),
   225  			zap.Int("case", clus.cs),
   226  			zap.Int("case-total", len(clus.cases)),
   227  			zap.String("desc", fa.Desc()),
   228  		)
   229  		if err := clus.WaitHealth(); err != nil {
   230  			return fmt.Errorf("wait full health error: %v", err)
   231  		}
   232  
   233  		checkerFailExceptions := []rpcpb.Checker{}
   234  		switch fcase {
   235  		case rpcpb.Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH:
   236  			// TODO: restore from snapshot
   237  			checkerFailExceptions = append(checkerFailExceptions, rpcpb.Checker_LEASE_EXPIRE)
   238  		}
   239  
   240  		clus.lg.Info(
   241  			"consistency check START",
   242  			zap.Int("round", clus.rd),
   243  			zap.Int("case", clus.cs),
   244  			zap.Int("case-total", len(clus.cases)),
   245  			zap.String("desc", fa.Desc()),
   246  		)
   247  		if err := clus.runCheckers(checkerFailExceptions...); err != nil {
   248  			return fmt.Errorf("consistency check error (%v)", err)
   249  		}
   250  		clus.lg.Info(
   251  			"consistency check PASS",
   252  			zap.Int("round", clus.rd),
   253  			zap.Int("case", clus.cs),
   254  			zap.Int("case-total", len(clus.cases)),
   255  			zap.String("desc", fa.Desc()),
   256  			zap.Duration("took", time.Since(caseNow)),
   257  		)
   258  	}
   259  
   260  	clus.lg.Info(
   261  		"round ALL PASS",
   262  		zap.Int("round", clus.rd),
   263  		zap.Strings("cases", clus.listCases()),
   264  		zap.Int("case-total", len(clus.cases)),
   265  		zap.Duration("took", time.Since(roundNow)),
   266  	)
   267  	return nil
   268  }
   269  
   270  func (clus *Cluster) updateRevision() error {
   271  	revs, _, err := clus.getRevisionHash()
   272  	for _, rev := range revs {
   273  		clus.currentRevision = rev
   274  		break // just need get one of the current revisions
   275  	}
   276  
   277  	clus.lg.Info(
   278  		"updated current revision",
   279  		zap.Int64("current-revision", clus.currentRevision),
   280  	)
   281  	return err
   282  }
   283  
   284  func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
   285  	if err = clus.compactKV(rev, timeout); err != nil {
   286  		clus.lg.Warn(
   287  			"compact FAIL",
   288  			zap.Int64("current-revision", clus.currentRevision),
   289  			zap.Int64("compact-revision", rev),
   290  			zap.Error(err),
   291  		)
   292  		return err
   293  	}
   294  	clus.lg.Info(
   295  		"compact DONE",
   296  		zap.Int64("current-revision", clus.currentRevision),
   297  		zap.Int64("compact-revision", rev),
   298  	)
   299  
   300  	if err = clus.checkCompact(rev); err != nil {
   301  		clus.lg.Warn(
   302  			"check compact FAIL",
   303  			zap.Int64("current-revision", clus.currentRevision),
   304  			zap.Int64("compact-revision", rev),
   305  			zap.Error(err),
   306  		)
   307  		return err
   308  	}
   309  	clus.lg.Info(
   310  		"check compact DONE",
   311  		zap.Int64("current-revision", clus.currentRevision),
   312  		zap.Int64("compact-revision", rev),
   313  	)
   314  
   315  	return nil
   316  }
   317  
   318  func (clus *Cluster) failed() {
   319  	clus.lg.Info(
   320  		"functional-tester FAIL",
   321  		zap.Int("round", clus.rd),
   322  		zap.Int("case", clus.cs),
   323  		zap.Int("case-total", len(clus.cases)),
   324  	)
   325  	clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()
   326  
   327  	os.Exit(2)
   328  }
   329  
   330  func (clus *Cluster) cleanup() error {
   331  	if clus.Tester.ExitOnCaseFail {
   332  		defer clus.failed()
   333  	}
   334  
   335  	roundFailedTotalCounter.Inc()
   336  	desc := "compact/defrag"
   337  	if clus.cs != -1 {
   338  		desc = clus.cases[clus.cs].Desc()
   339  	}
   340  	caseFailedTotalCounter.WithLabelValues(desc).Inc()
   341  
   342  	clus.lg.Info(
   343  		"closing stressers before archiving failure data",
   344  		zap.Int("round", clus.rd),
   345  		zap.Int("case", clus.cs),
   346  		zap.Int("case-total", len(clus.cases)),
   347  	)
   348  	clus.stresser.Close()
   349  
   350  	if err := clus.send_SIGQUIT_ETCD_AND_ARCHIVE_DATA(); err != nil {
   351  		clus.lg.Warn(
   352  			"cleanup FAIL",
   353  			zap.Int("round", clus.rd),
   354  			zap.Int("case", clus.cs),
   355  			zap.Int("case-total", len(clus.cases)),
   356  			zap.Error(err),
   357  		)
   358  		return err
   359  	}
   360  	if err := clus.send_RESTART_ETCD(); err != nil {
   361  		clus.lg.Warn(
   362  			"restart FAIL",
   363  			zap.Int("round", clus.rd),
   364  			zap.Int("case", clus.cs),
   365  			zap.Int("case-total", len(clus.cases)),
   366  			zap.Error(err),
   367  		)
   368  		return err
   369  	}
   370  
   371  	clus.setStresserChecker()
   372  	return nil
   373  }