github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/tester/case.go (about)

     1  // Copyright 2018 The etcd Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tester
    16  
    17  import (
    18  	"fmt"
    19  	"math/rand"
    20  	"time"
    21  
    22  	"github.com/lfch/etcd-io/tests/v3/functional/rpcpb"
    23  
    24  	"go.uber.org/zap"
    25  )
    26  
    27  // Case defines failure/test injection interface.
    28  // To add a test case:
    29  //  1. implement "Case" interface
    30  //  2. define fail case name in "rpcpb.Case"
    31  type Case interface {
    32  	// Inject injects the failure into the testing cluster at the given
    33  	// round. When calling the function, the cluster should be in health.
    34  	Inject(clus *Cluster) error
    35  	// Recover recovers the injected failure caused by the injection of the
    36  	// given round and wait for the recovery of the testing cluster.
    37  	Recover(clus *Cluster) error
    38  	// Desc returns a description of the failure
    39  	Desc() string
    40  	// TestCase returns "rpcpb.Case" enum type.
    41  	TestCase() rpcpb.Case
    42  }
    43  
    44  type injectMemberFunc func(*Cluster, int) error
    45  type recoverMemberFunc func(*Cluster, int) error
    46  
// caseByFunc bundles the per-member inject/recover callbacks together with
// the identity of a case. The member-selection types in this file build on
// it by embedding or by type definition.
type caseByFunc struct {
	// desc is an optional description; when empty, Desc falls back to the
	// string form of rpcpbCase.
	desc string
	// rpcpbCase is the enum value returned by TestCase.
	rpcpbCase rpcpb.Case
	// injectMember applies the failure to the member at the given index.
	injectMember injectMemberFunc
	// recoverMember recovers the member at the given index.
	recoverMember recoverMemberFunc
}
    53  
    54  func (c *caseByFunc) Desc() string {
    55  	if c.desc != "" {
    56  		return c.desc
    57  	}
    58  	return c.rpcpbCase.String()
    59  }
    60  
    61  func (c *caseByFunc) TestCase() rpcpb.Case {
    62  	return c.rpcpbCase
    63  }
    64  
    65  type caseFollower struct {
    66  	caseByFunc
    67  	last int
    68  	lead int
    69  }
    70  
    71  func (c *caseFollower) updateIndex(clus *Cluster) error {
    72  	lead, err := clus.GetLeader()
    73  	if err != nil {
    74  		return err
    75  	}
    76  	c.lead = lead
    77  
    78  	n := len(clus.Members)
    79  	if c.last == -1 { // first run
    80  		c.last = clus.rd % n
    81  		if c.last == c.lead {
    82  			c.last = (c.last + 1) % n
    83  		}
    84  	} else {
    85  		c.last = (c.last + 1) % n
    86  		if c.last == c.lead {
    87  			c.last = (c.last + 1) % n
    88  		}
    89  	}
    90  	return nil
    91  }
    92  
    93  func (c *caseFollower) Inject(clus *Cluster) error {
    94  	if err := c.updateIndex(clus); err != nil {
    95  		return err
    96  	}
    97  	return c.injectMember(clus, c.last)
    98  }
    99  
   100  func (c *caseFollower) Recover(clus *Cluster) error {
   101  	return c.recoverMember(clus, c.last)
   102  }
   103  
   104  func (c *caseFollower) Desc() string {
   105  	if c.desc != "" {
   106  		return c.desc
   107  	}
   108  	return c.rpcpbCase.String()
   109  }
   110  
   111  func (c *caseFollower) TestCase() rpcpb.Case {
   112  	return c.rpcpbCase
   113  }
   114  
   115  type caseLeader struct {
   116  	caseByFunc
   117  	last int
   118  	lead int
   119  }
   120  
   121  func (c *caseLeader) updateIndex(clus *Cluster) error {
   122  	lead, err := clus.GetLeader()
   123  	if err != nil {
   124  		return err
   125  	}
   126  	c.lead = lead
   127  	c.last = lead
   128  	return nil
   129  }
   130  
   131  func (c *caseLeader) Inject(clus *Cluster) error {
   132  	if err := c.updateIndex(clus); err != nil {
   133  		return err
   134  	}
   135  	return c.injectMember(clus, c.last)
   136  }
   137  
   138  func (c *caseLeader) Recover(clus *Cluster) error {
   139  	return c.recoverMember(clus, c.last)
   140  }
   141  
   142  func (c *caseLeader) TestCase() rpcpb.Case {
   143  	return c.rpcpbCase
   144  }
   145  
// caseQuorum applies a caseByFunc to a randomly picked quorum of members.
type caseQuorum struct {
	caseByFunc
	// injected holds the member indices chosen by the most recent Inject
	// call; Recover iterates over the same set.
	injected map[int]struct{}
}
   150  
   151  func (c *caseQuorum) Inject(clus *Cluster) error {
   152  	c.injected = pickQuorum(len(clus.Members))
   153  	for idx := range c.injected {
   154  		if err := c.injectMember(clus, idx); err != nil {
   155  			return err
   156  		}
   157  	}
   158  	return nil
   159  }
   160  
   161  func (c *caseQuorum) Recover(clus *Cluster) error {
   162  	for idx := range c.injected {
   163  		if err := c.recoverMember(clus, idx); err != nil {
   164  			return err
   165  		}
   166  	}
   167  	return nil
   168  }
   169  
   170  func (c *caseQuorum) Desc() string {
   171  	if c.desc != "" {
   172  		return c.desc
   173  	}
   174  	return c.rpcpbCase.String()
   175  }
   176  
   177  func (c *caseQuorum) TestCase() rpcpb.Case {
   178  	return c.rpcpbCase
   179  }
   180  
   181  func pickQuorum(size int) (picked map[int]struct{}) {
   182  	picked = make(map[int]struct{})
   183  	r := rand.New(rand.NewSource(time.Now().UnixNano()))
   184  	quorum := size/2 + 1
   185  	for len(picked) < quorum {
   186  		idx := r.Intn(size)
   187  		picked[idx] = struct{}{}
   188  	}
   189  	return picked
   190  }
   191  
// caseAll applies a caseByFunc to every member of the cluster. It is a
// distinct type with caseByFunc's fields, so it defines its own methods.
type caseAll caseByFunc
   193  
   194  func (c *caseAll) Inject(clus *Cluster) error {
   195  	for i := range clus.Members {
   196  		if err := c.injectMember(clus, i); err != nil {
   197  			return err
   198  		}
   199  	}
   200  	return nil
   201  }
   202  
   203  func (c *caseAll) Recover(clus *Cluster) error {
   204  	for i := range clus.Members {
   205  		if err := c.recoverMember(clus, i); err != nil {
   206  			return err
   207  		}
   208  	}
   209  	return nil
   210  }
   211  
   212  func (c *caseAll) Desc() string {
   213  	if c.desc != "" {
   214  		return c.desc
   215  	}
   216  	return c.rpcpbCase.String()
   217  }
   218  
   219  func (c *caseAll) TestCase() rpcpb.Case {
   220  	return c.rpcpbCase
   221  }
   222  
// caseUntilSnapshot injects a failure/test and waits for a snapshot event.
// It wraps another Case: Inject and Desc consult the wrapped value, while
// methods not overridden here (such as Recover) are promoted from it.
type caseUntilSnapshot struct {
	// desc is an optional description that takes precedence in Desc.
	desc string
	// rpcpbCase is the enum value returned by TestCase; it is also used to
	// look up slowCases when sizing the retry budget in Inject.
	rpcpbCase rpcpb.Case
	// Case is the wrapped case whose failure is injected first.
	Case
}
   229  
// slowCases lists all delay failure cases except the ones failing with
// latency greater than election timeout (those trigger a leader election
// and the cluster keeps operating anyway). caseUntilSnapshot.Inject
// multiplies its retry budget by 5 for the cases listed here.
var slowCases = map[rpcpb.Case]bool{
	rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER:                        true,
	rpcpb.Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT:        true,
	rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT: true,
	rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER:                              true,
	rpcpb.Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT:              true,
	rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT:       true,
	rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM:                              true,
	rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ALL:                                 true,
}
   243  
   244  func (c *caseUntilSnapshot) Inject(clus *Cluster) error {
   245  	if err := c.Case.Inject(clus); err != nil {
   246  		return err
   247  	}
   248  
   249  	snapshotCount := clus.Members[0].Etcd.SnapshotCount
   250  
   251  	now := time.Now()
   252  	clus.lg.Info(
   253  		"trigger snapshot START",
   254  		zap.String("desc", c.Desc()),
   255  		zap.Int64("etcd-snapshot-count", snapshotCount),
   256  	)
   257  
   258  	// maxRev may fail since failure just injected, retry if failed.
   259  	startRev, err := clus.maxRev()
   260  	for i := 0; i < 10 && startRev == 0; i++ {
   261  		startRev, err = clus.maxRev()
   262  	}
   263  	if startRev == 0 {
   264  		return err
   265  	}
   266  	lastRev := startRev
   267  
   268  	// healthy cluster could accept 1000 req/sec at least.
   269  	// 3x time to trigger snapshot.
   270  	retries := int(snapshotCount) / 1000 * 3
   271  	if v, ok := slowCases[c.TestCase()]; v && ok {
   272  		// slow network takes more retries
   273  		retries *= 5
   274  	}
   275  
   276  	for i := 0; i < retries; i++ {
   277  		lastRev, err = clus.maxRev()
   278  		if lastRev == 0 {
   279  			clus.lg.Info(
   280  				"trigger snapshot RETRY",
   281  				zap.Int("retries", i),
   282  				zap.Int64("etcd-snapshot-count", snapshotCount),
   283  				zap.Int64("start-revision", startRev),
   284  				zap.Error(err),
   285  			)
   286  			time.Sleep(3 * time.Second)
   287  			continue
   288  		}
   289  
   290  		// If the number of proposals committed is bigger than snapshot count,
   291  		// a new snapshot should have been created.
   292  		diff := lastRev - startRev
   293  		if diff > snapshotCount {
   294  			clus.lg.Info(
   295  				"trigger snapshot PASS",
   296  				zap.Int("retries", i),
   297  				zap.String("desc", c.Desc()),
   298  				zap.Int64("committed-entries", diff),
   299  				zap.Int64("etcd-snapshot-count", snapshotCount),
   300  				zap.Int64("start-revision", startRev),
   301  				zap.Int64("last-revision", lastRev),
   302  				zap.Duration("took", time.Since(now)),
   303  			)
   304  			return nil
   305  		}
   306  
   307  		clus.lg.Info(
   308  			"trigger snapshot RETRY",
   309  			zap.Int("retries", i),
   310  			zap.Int64("committed-entries", diff),
   311  			zap.Int64("etcd-snapshot-count", snapshotCount),
   312  			zap.Int64("start-revision", startRev),
   313  			zap.Int64("last-revision", lastRev),
   314  			zap.Duration("took", time.Since(now)),
   315  			zap.Error(err),
   316  		)
   317  		time.Sleep(time.Second)
   318  		if err != nil {
   319  			time.Sleep(2 * time.Second)
   320  		}
   321  	}
   322  
   323  	return fmt.Errorf("cluster too slow: only %d commits in %d retries", lastRev-startRev, retries)
   324  }
   325  
   326  func (c *caseUntilSnapshot) Desc() string {
   327  	if c.desc != "" {
   328  		return c.desc
   329  	}
   330  	if c.rpcpbCase.String() != "" {
   331  		return c.rpcpbCase.String()
   332  	}
   333  	return c.Case.Desc()
   334  }
   335  
   336  func (c *caseUntilSnapshot) TestCase() rpcpb.Case {
   337  	return c.rpcpbCase
   338  }