github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/tester/cluster_run.go (about)

     1  // Copyright 2018 The etcd Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tester
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"testing"
    21  	"time"
    22  
    23  	"github.com/lfch/etcd-io/client/pkg/v3/fileutil"
    24  	"github.com/lfch/etcd-io/tests/v3/functional/rpcpb"
    25  
    26  	"go.uber.org/zap"
    27  )
    28  
    29  // compactQPS is rough number of compact requests per second.
    30  // Previous tests showed etcd can compact about 60,000 entries per second.
    31  const compactQPS = 50000
    32  
    33  // Run starts tester.
    34  func (clus *Cluster) Run(t *testing.T) {
    35  	defer printReport()
    36  
    37  	// updateCases must be executed after etcd is started, because the FAILPOINTS case
    38  	// needs to obtain all the failpoints from the etcd member.
    39  	clus.updateCases()
    40  
    41  	if err := fileutil.TouchDirAll(clus.lg, clus.Tester.DataDir); err != nil {
    42  		clus.lg.Panic(
    43  			"failed to create test data directory",
    44  			zap.String("dir", clus.Tester.DataDir),
    45  			zap.Error(err),
    46  		)
    47  	}
    48  
    49  	var preModifiedKey int64
    50  	for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ {
    51  		t.Run(fmt.Sprintf("round:%v", round), func(t *testing.T) {
    52  			preModifiedKey = clus.doRoundAndCompact(t, round, preModifiedKey)
    53  		})
    54  
    55  		if round > 0 && round%500 == 0 { // every 500 rounds
    56  			t.Logf("Defragmenting in round: %v", round)
    57  			if err := clus.defrag(); err != nil {
    58  				clus.failed(err)
    59  				return
    60  			}
    61  		}
    62  	}
    63  
    64  	clus.lg.Info(
    65  		"functional-tester PASS",
    66  		zap.Int("round", clus.rd),
    67  		zap.Int("case", clus.cs),
    68  		zap.Int("case-total", len(clus.cases)),
    69  	)
    70  }
    71  
    72  func (clus *Cluster) doRoundAndCompact(t *testing.T, round int, preModifiedKey int64) (postModifiedKey int64) {
    73  	roundTotalCounter.Inc()
    74  	clus.rd = round
    75  
    76  	if err := clus.doRound(t); err != nil {
    77  		clus.lg.Error(
    78  			"round FAIL",
    79  			zap.Int("round", clus.rd),
    80  			zap.Int("case", clus.cs),
    81  			zap.Int("case-total", len(clus.cases)),
    82  			zap.Error(err),
    83  		)
    84  		if clus.cleanup(err) != nil {
    85  			return
    86  		}
    87  		// reset preModifiedKey after clean up
    88  		postModifiedKey = 0
    89  		return
    90  	}
    91  
    92  	// -1 so that logPrefix doesn't print out 'case'
    93  	clus.cs = -1
    94  
    95  	revToCompact := max(0, clus.currentRevision-10000)
    96  	currentModifiedKey := clus.stresser.ModifiedKeys()
    97  	modifiedKey := currentModifiedKey - preModifiedKey
    98  	timeout := 10 * time.Second
    99  	timeout += time.Duration(modifiedKey/compactQPS) * time.Second
   100  	clus.lg.Info(
   101  		"compact START",
   102  		zap.Int("round", clus.rd),
   103  		zap.Int("case", clus.cs),
   104  		zap.Int("case-total", len(clus.cases)),
   105  		zap.Duration("timeout", timeout),
   106  	)
   107  	if err := clus.compact(revToCompact, timeout); err != nil {
   108  		clus.lg.Warn(
   109  			"compact FAIL",
   110  			zap.Int("round", clus.rd),
   111  			zap.Int("case", clus.cs),
   112  			zap.Int("case-total", len(clus.cases)),
   113  			zap.Error(err),
   114  		)
   115  		if err = clus.cleanup(err); err != nil {
   116  			clus.lg.Warn(
   117  				"cleanup FAIL",
   118  				zap.Int("round", clus.rd),
   119  				zap.Int("case", clus.cs),
   120  				zap.Int("case-total", len(clus.cases)),
   121  				zap.Error(err),
   122  			)
   123  			return
   124  		}
   125  		// reset preModifiedKey after clean up
   126  		return 0
   127  	}
   128  	return currentModifiedKey
   129  }
   130  
   131  func (clus *Cluster) doRound(t *testing.T) error {
   132  	if clus.Tester.CaseShuffle {
   133  		clus.shuffleCases()
   134  	}
   135  
   136  	roundNow := time.Now()
   137  	clus.lg.Info(
   138  		"round START",
   139  		zap.Int("round", clus.rd),
   140  		zap.Int("case-total", len(clus.cases)),
   141  		zap.Strings("cases", clus.listCases()),
   142  	)
   143  	for i, fa := range clus.cases {
   144  		clus.cs = i
   145  		t.Run(fmt.Sprintf("%v_%s", i, fa.TestCase()),
   146  			func(t *testing.T) {
   147  				clus.doTestCase(t, fa)
   148  			})
   149  	}
   150  
   151  	clus.lg.Info(
   152  		"round ALL PASS",
   153  		zap.Int("round", clus.rd),
   154  		zap.Strings("cases", clus.listCases()),
   155  		zap.Int("case-total", len(clus.cases)),
   156  		zap.Duration("took", time.Since(roundNow)),
   157  	)
   158  	return nil
   159  }
   160  
   161  func (clus *Cluster) doTestCase(t *testing.T, fa Case) {
   162  	caseTotal[fa.Desc()]++
   163  	caseTotalCounter.WithLabelValues(fa.Desc()).Inc()
   164  
   165  	caseNow := time.Now()
   166  	clus.lg.Info(
   167  		"case START",
   168  		zap.Int("round", clus.rd),
   169  		zap.Int("case", clus.cs),
   170  		zap.Int("case-total", len(clus.cases)),
   171  		zap.String("desc", fa.Desc()),
   172  	)
   173  
   174  	clus.lg.Info("wait health before injecting failures")
   175  	if err := clus.WaitHealth(); err != nil {
   176  		t.Fatalf("wait full health error: %v", err)
   177  	}
   178  
   179  	stressStarted := false
   180  	fcase := fa.TestCase()
   181  	if fcase != rpcpb.Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
   182  		clus.lg.Info(
   183  			"stress START",
   184  			zap.Int("round", clus.rd),
   185  			zap.Int("case", clus.cs),
   186  			zap.Int("case-total", len(clus.cases)),
   187  			zap.String("desc", fa.Desc()),
   188  		)
   189  		if err := clus.stresser.Stress(); err != nil {
   190  			t.Fatalf("start stresser error: %v", err)
   191  		}
   192  		stressStarted = true
   193  	}
   194  
   195  	clus.lg.Info(
   196  		"inject START",
   197  		zap.Int("round", clus.rd),
   198  		zap.Int("case", clus.cs),
   199  		zap.Int("case-total", len(clus.cases)),
   200  		zap.String("desc", fa.Desc()),
   201  	)
   202  	if err := fa.Inject(clus); err != nil {
   203  		t.Fatalf("injection error: %v", err)
   204  	}
   205  
   206  	// if run local, recovering server may conflict
   207  	// with stressing client ports
   208  	// TODO: use unix for local tests
   209  	clus.lg.Info(
   210  		"recover START",
   211  		zap.Int("round", clus.rd),
   212  		zap.Int("case", clus.cs),
   213  		zap.Int("case-total", len(clus.cases)),
   214  		zap.String("desc", fa.Desc()),
   215  	)
   216  	if err := fa.Recover(clus); err != nil {
   217  		t.Fatalf("recovery error: %v", err)
   218  	}
   219  
   220  	if stressStarted {
   221  		clus.lg.Info(
   222  			"stress PAUSE",
   223  			zap.Int("round", clus.rd),
   224  			zap.Int("case", clus.cs),
   225  			zap.Int("case-total", len(clus.cases)),
   226  			zap.String("desc", fa.Desc()),
   227  		)
   228  		ems := clus.stresser.Pause()
   229  		if fcase == rpcpb.Case_NO_FAIL_WITH_STRESS && len(ems) > 0 {
   230  			ess := make([]string, 0, len(ems))
   231  			cnt := 0
   232  			for k, v := range ems {
   233  				ess = append(ess, fmt.Sprintf("%s (count: %d)", k, v))
   234  				cnt += v
   235  			}
   236  			clus.lg.Warn(
   237  				"expected no errors",
   238  				zap.String("desc", fa.Desc()),
   239  				zap.Strings("errors", ess),
   240  			)
   241  
   242  			// with network delay, some ongoing requests may fail
   243  			// only return error, if more than 30% of QPS requests fail
   244  			if cnt > int(float64(clus.Tester.StressQPS)*0.3) {
   245  				t.Fatalf("expected no error in %q, got %q", fcase.String(), ess)
   246  			}
   247  		}
   248  	}
   249  
   250  	clus.lg.Info(
   251  		"health check START",
   252  		zap.Int("round", clus.rd),
   253  		zap.Int("case", clus.cs),
   254  		zap.Int("case-total", len(clus.cases)),
   255  		zap.String("desc", fa.Desc()),
   256  	)
   257  	if err := clus.WaitHealth(); err != nil {
   258  		t.Fatalf("wait full health error: %v", err)
   259  	}
   260  
   261  	var checkerFailExceptions []rpcpb.Checker
   262  	switch fcase {
   263  	case rpcpb.Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH:
   264  		// TODO: restore from snapshot
   265  		checkerFailExceptions = append(checkerFailExceptions, rpcpb.Checker_LEASE_EXPIRE)
   266  	}
   267  
   268  	clus.lg.Info(
   269  		"consistency check START",
   270  		zap.Int("round", clus.rd),
   271  		zap.Int("case", clus.cs),
   272  		zap.Int("case-total", len(clus.cases)),
   273  		zap.String("desc", fa.Desc()),
   274  	)
   275  	if err := clus.runCheckers(checkerFailExceptions...); err != nil {
   276  		t.Fatalf("consistency check error (%v)", err)
   277  	}
   278  	clus.lg.Info(
   279  		"consistency check PASS",
   280  		zap.Int("round", clus.rd),
   281  		zap.Int("case", clus.cs),
   282  		zap.Int("case-total", len(clus.cases)),
   283  		zap.String("desc", fa.Desc()),
   284  		zap.Duration("took", time.Since(caseNow)),
   285  	)
   286  }
   287  
   288  func (clus *Cluster) updateRevision() error {
   289  	revs, _, err := clus.getRevisionHash()
   290  	for _, rev := range revs {
   291  		clus.currentRevision = rev
   292  		break // just need get one of the current revisions
   293  	}
   294  
   295  	clus.lg.Info(
   296  		"updated current revision",
   297  		zap.Int64("current-revision", clus.currentRevision),
   298  	)
   299  	return err
   300  }
   301  
   302  func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
   303  	if err = clus.compactKV(rev, timeout); err != nil {
   304  		clus.lg.Warn(
   305  			"compact FAIL",
   306  			zap.Int64("current-revision", clus.currentRevision),
   307  			zap.Int64("compact-revision", rev),
   308  			zap.Error(err),
   309  		)
   310  		return err
   311  	}
   312  	clus.lg.Info(
   313  		"compact DONE",
   314  		zap.Int64("current-revision", clus.currentRevision),
   315  		zap.Int64("compact-revision", rev),
   316  	)
   317  
   318  	if err = clus.checkCompact(rev); err != nil {
   319  		clus.lg.Warn(
   320  			"check compact FAIL",
   321  			zap.Int64("current-revision", clus.currentRevision),
   322  			zap.Int64("compact-revision", rev),
   323  			zap.Error(err),
   324  		)
   325  		return err
   326  	}
   327  	clus.lg.Info(
   328  		"check compact DONE",
   329  		zap.Int64("current-revision", clus.currentRevision),
   330  		zap.Int64("compact-revision", rev),
   331  	)
   332  
   333  	return nil
   334  }
   335  
   336  func (clus *Cluster) failed(err error) {
   337  	clus.lg.Error(
   338  		"functional-tester FAIL",
   339  		zap.Int("round", clus.rd),
   340  		zap.Int("case", clus.cs),
   341  		zap.Int("case-total", len(clus.cases)),
   342  		zap.Error(err),
   343  	)
   344  	clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()
   345  
   346  	os.Exit(2)
   347  }
   348  
   349  func (clus *Cluster) cleanup(err error) error {
   350  	if clus.Tester.ExitOnCaseFail {
   351  		defer clus.failed(err)
   352  	}
   353  
   354  	roundFailedTotalCounter.Inc()
   355  	desc := "compact/defrag"
   356  	if clus.cs != -1 {
   357  		desc = clus.cases[clus.cs].Desc()
   358  	}
   359  	caseFailedTotalCounter.WithLabelValues(desc).Inc()
   360  
   361  	clus.lg.Info(
   362  		"closing stressers before archiving failure data",
   363  		zap.Int("round", clus.rd),
   364  		zap.Int("case", clus.cs),
   365  		zap.Int("case-total", len(clus.cases)),
   366  	)
   367  	clus.stresser.Close()
   368  
   369  	if err := clus.send_SIGQUIT_ETCD_AND_ARCHIVE_DATA(); err != nil {
   370  		clus.lg.Warn(
   371  			"cleanup FAIL",
   372  			zap.Int("round", clus.rd),
   373  			zap.Int("case", clus.cs),
   374  			zap.Int("case-total", len(clus.cases)),
   375  			zap.Error(err),
   376  		)
   377  		return err
   378  	}
   379  	if err := clus.send_RESTART_ETCD(); err != nil {
   380  		clus.lg.Warn(
   381  			"restart FAIL",
   382  			zap.Int("round", clus.rd),
   383  			zap.Int("case", clus.cs),
   384  			zap.Int("case-total", len(clus.cases)),
   385  			zap.Error(err),
   386  		)
   387  		return err
   388  	}
   389  
   390  	clus.setStresserChecker()
   391  	return nil
   392  }