github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/functional/tester/stresser_lease.go (about)

     1  // Copyright 2018 The etcd Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tester
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"math/rand"
    21  	"sync"
    22  	"sync/atomic"
    23  	"time"
    24  
    25  	"github.com/lfch/etcd-io/api/v3/v3rpc/rpctypes"
    26  	"github.com/lfch/etcd-io/client/v3"
    27  	"github.com/lfch/etcd-io/tests/v3/functional/rpcpb"
    28  
    29  	"go.uber.org/zap"
    30  	"golang.org/x/time/rate"
    31  	"google.golang.org/grpc"
    32  )
    33  
    34  const (
    35  	// time to live for lease
    36  	defaultTTL      = 120
    37  	defaultTTLShort = 2
    38  )
    39  
// leaseStresser repeatedly creates leases with attached keys against a single
// member and randomly revokes some of them.
//
// Field overview:
//   - stype, lg, m: stresser type (used in log fields), logger, and the member
//     whose client endpoint is targeted.
//   - cli, ctx, cancel: set by every Stress call; Close cancels ctx and closes cli.
//   - rateLimiter: throttles each iteration of run.
//   - numLeases, keysPerLease: must be non-zero, otherwise setupOnce panics.
//   - aliveLeases: leases created with defaultTTL, mapped to the time of creation
//     or last observed renewal; created once and kept across Stress calls.
//   - alivedLeasesWithShortTTL, revokedLeases, shortLivedLeases: replaced with
//     empty tables on every Stress call.
//   - runWg tracks the run goroutine; aliveWg tracks keepLeaseAlive goroutines.
//
// NOTE(review): atomicModifiedKey is accessed through sync/atomic; on 32-bit
// platforms 64-bit alignment is the caller's responsibility, so re-check its
// offset whenever fields are added or reordered.
type leaseStresser struct {
	stype rpcpb.StresserType
	lg    *zap.Logger

	m      *rpcpb.Member
	cli    *clientv3.Client
	ctx    context.Context
	cancel func()

	rateLimiter *rate.Limiter
	// atomicModifiedKey records the number of keys created and deleted during a test case
	atomicModifiedKey        int64
	numLeases                int
	keysPerLease             int
	aliveLeases              *atomicLeases
	alivedLeasesWithShortTTL *atomicLeases
	revokedLeases            *atomicLeases
	shortLivedLeases         *atomicLeases

	runWg   sync.WaitGroup
	aliveWg sync.WaitGroup
}
    62  
    63  type atomicLeases struct {
    64  	// rwLock is used to protect read/write access of leases map
    65  	// which are accessed and modified by different goroutines.
    66  	rwLock sync.RWMutex
    67  	leases map[int64]time.Time
    68  }
    69  
    70  func (al *atomicLeases) add(leaseID int64, t time.Time) {
    71  	al.rwLock.Lock()
    72  	al.leases[leaseID] = t
    73  	al.rwLock.Unlock()
    74  }
    75  
    76  func (al *atomicLeases) update(leaseID int64, t time.Time) {
    77  	al.rwLock.Lock()
    78  	_, ok := al.leases[leaseID]
    79  	if ok {
    80  		al.leases[leaseID] = t
    81  	}
    82  	al.rwLock.Unlock()
    83  }
    84  
    85  func (al *atomicLeases) read(leaseID int64) (rv time.Time, ok bool) {
    86  	al.rwLock.RLock()
    87  	rv, ok = al.leases[leaseID]
    88  	al.rwLock.RUnlock()
    89  	return rv, ok
    90  }
    91  
    92  func (al *atomicLeases) remove(leaseID int64) {
    93  	al.rwLock.Lock()
    94  	delete(al.leases, leaseID)
    95  	al.rwLock.Unlock()
    96  }
    97  
    98  func (al *atomicLeases) getLeasesMap() map[int64]time.Time {
    99  	leasesCopy := make(map[int64]time.Time)
   100  	al.rwLock.RLock()
   101  	for k, v := range al.leases {
   102  		leasesCopy[k] = v
   103  	}
   104  	al.rwLock.RUnlock()
   105  	return leasesCopy
   106  }
   107  
   108  func (ls *leaseStresser) setupOnce() error {
   109  	if ls.aliveLeases != nil {
   110  		return nil
   111  	}
   112  	if ls.numLeases == 0 {
   113  		panic("expect numLeases to be set")
   114  	}
   115  	if ls.keysPerLease == 0 {
   116  		panic("expect keysPerLease to be set")
   117  	}
   118  
   119  	ls.aliveLeases = &atomicLeases{leases: make(map[int64]time.Time)}
   120  	return nil
   121  }
   122  
   123  func (ls *leaseStresser) Stress() error {
   124  	ls.lg.Info(
   125  		"stress START",
   126  		zap.String("stress-type", ls.stype.String()),
   127  		zap.String("endpoint", ls.m.EtcdClientEndpoint),
   128  	)
   129  
   130  	if err := ls.setupOnce(); err != nil {
   131  		return err
   132  	}
   133  
   134  	ctx, cancel := context.WithCancel(context.Background())
   135  	ls.ctx = ctx
   136  	ls.cancel = cancel
   137  
   138  	cli, err := ls.m.CreateEtcdClient(grpc.WithBackoffMaxDelay(1 * time.Second))
   139  	if err != nil {
   140  		return fmt.Errorf("%v (%s)", err, ls.m.EtcdClientEndpoint)
   141  	}
   142  	ls.cli = cli
   143  
   144  	ls.revokedLeases = &atomicLeases{leases: make(map[int64]time.Time)}
   145  	ls.shortLivedLeases = &atomicLeases{leases: make(map[int64]time.Time)}
   146  	ls.alivedLeasesWithShortTTL = &atomicLeases{leases: make(map[int64]time.Time)}
   147  
   148  	ls.runWg.Add(1)
   149  	go ls.run()
   150  	return nil
   151  }
   152  
   153  func (ls *leaseStresser) run() {
   154  	defer ls.runWg.Done()
   155  	ls.restartKeepAlives()
   156  	for {
   157  		// the number of keys created and deleted is roughly 2x the number of created keys for an iteration.
   158  		// the rateLimiter therefore consumes 2x ls.numLeases*ls.keysPerLease tokens where each token represents a create/delete operation for key.
   159  		err := ls.rateLimiter.WaitN(ls.ctx, 2*ls.numLeases*ls.keysPerLease)
   160  		if err == context.Canceled {
   161  			return
   162  		}
   163  
   164  		ls.lg.Debug(
   165  			"stress creating leases",
   166  			zap.String("stress-type", ls.stype.String()),
   167  			zap.String("endpoint", ls.m.EtcdClientEndpoint),
   168  		)
   169  		ls.createLeases()
   170  		ls.lg.Debug(
   171  			"stress created leases",
   172  			zap.String("stress-type", ls.stype.String()),
   173  			zap.String("endpoint", ls.m.EtcdClientEndpoint),
   174  		)
   175  
   176  		ls.lg.Debug(
   177  			"stress dropped leases",
   178  			zap.String("stress-type", ls.stype.String()),
   179  			zap.String("endpoint", ls.m.EtcdClientEndpoint),
   180  		)
   181  		ls.randomlyDropLeases()
   182  		ls.lg.Debug(
   183  			"stress dropped leases",
   184  			zap.String("stress-type", ls.stype.String()),
   185  			zap.String("endpoint", ls.m.EtcdClientEndpoint),
   186  		)
   187  	}
   188  }
   189  
   190  func (ls *leaseStresser) restartKeepAlives() {
   191  	for leaseID := range ls.aliveLeases.getLeasesMap() {
   192  		ls.aliveWg.Add(1)
   193  		go func(id int64) {
   194  			ls.keepLeaseAlive(id)
   195  		}(leaseID)
   196  	}
   197  	for leaseID := range ls.alivedLeasesWithShortTTL.getLeasesMap() {
   198  		ls.aliveWg.Add(1)
   199  		go func(id int64) {
   200  			ls.keepLeaseAlive(id)
   201  		}(leaseID)
   202  	}
   203  }
   204  
   205  func (ls *leaseStresser) createLeases() {
   206  	ls.createAliveLeasesWithShortTTL()
   207  	ls.createAliveLeases()
   208  	ls.createShortLivedLeases()
   209  }
   210  
   211  func (ls *leaseStresser) createAliveLeases() {
   212  	neededLeases := ls.numLeases - len(ls.aliveLeases.getLeasesMap())
   213  	var wg sync.WaitGroup
   214  	for i := 0; i < neededLeases; i++ {
   215  		wg.Add(1)
   216  		go func() {
   217  			defer wg.Done()
   218  			leaseID, err := ls.createLeaseWithKeys(defaultTTL)
   219  			if err != nil {
   220  				ls.lg.Debug(
   221  					"createLeaseWithKeys failed",
   222  					zap.String("endpoint", ls.m.EtcdClientEndpoint),
   223  					zap.Error(err),
   224  				)
   225  				return
   226  			}
   227  			ls.aliveLeases.add(leaseID, time.Now())
   228  			// keep track of all the keep lease alive goroutines
   229  			ls.aliveWg.Add(1)
   230  			go ls.keepLeaseAlive(leaseID)
   231  		}()
   232  	}
   233  	wg.Wait()
   234  }
   235  
   236  func (ls *leaseStresser) createAliveLeasesWithShortTTL() {
   237  	neededLeases := 2
   238  	var wg sync.WaitGroup
   239  	for i := 0; i < neededLeases; i++ {
   240  		wg.Add(1)
   241  		go func() {
   242  			defer wg.Done()
   243  			leaseID, err := ls.createLeaseWithKeys(defaultTTLShort)
   244  			if err != nil {
   245  				ls.lg.Debug(
   246  					"createLeaseWithKeys failed",
   247  					zap.String("endpoint", ls.m.EtcdClientEndpoint),
   248  					zap.Error(err),
   249  				)
   250  				return
   251  			}
   252  			ls.lg.Debug("createAliveLeasesWithShortTTL", zap.Int64("lease-id", leaseID))
   253  			ls.alivedLeasesWithShortTTL.add(leaseID, time.Now())
   254  			// keep track of all the keep lease alive goroutines
   255  			ls.aliveWg.Add(1)
   256  			go ls.keepLeaseAlive(leaseID)
   257  		}()
   258  	}
   259  	wg.Wait()
   260  }
   261  
   262  func (ls *leaseStresser) createShortLivedLeases() {
   263  	// one round of createLeases() might not create all the short lived leases we want due to failures.
   264  	// thus, we want to create remaining short lived leases in the future round.
   265  	neededLeases := ls.numLeases - len(ls.shortLivedLeases.getLeasesMap())
   266  	var wg sync.WaitGroup
   267  	for i := 0; i < neededLeases; i++ {
   268  		wg.Add(1)
   269  		go func() {
   270  			defer wg.Done()
   271  			leaseID, err := ls.createLeaseWithKeys(defaultTTLShort)
   272  			if err != nil {
   273  				return
   274  			}
   275  			ls.shortLivedLeases.add(leaseID, time.Now())
   276  		}()
   277  	}
   278  	wg.Wait()
   279  }
   280  
   281  func (ls *leaseStresser) createLeaseWithKeys(ttl int64) (int64, error) {
   282  	leaseID, err := ls.createLease(ttl)
   283  	if err != nil {
   284  		ls.lg.Debug(
   285  			"createLease failed",
   286  			zap.String("stress-type", ls.stype.String()),
   287  			zap.String("endpoint", ls.m.EtcdClientEndpoint),
   288  			zap.Error(err),
   289  		)
   290  		return -1, err
   291  	}
   292  
   293  	ls.lg.Debug(
   294  		"createLease created lease",
   295  		zap.String("stress-type", ls.stype.String()),
   296  		zap.String("endpoint", ls.m.EtcdClientEndpoint),
   297  		zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   298  	)
   299  	if err := ls.attachKeysWithLease(leaseID); err != nil {
   300  		return -1, err
   301  	}
   302  	return leaseID, nil
   303  }
   304  
   305  func (ls *leaseStresser) randomlyDropLeases() {
   306  	var wg sync.WaitGroup
   307  	for l := range ls.aliveLeases.getLeasesMap() {
   308  		wg.Add(1)
   309  		go func(leaseID int64) {
   310  			defer wg.Done()
   311  			dropped, err := ls.randomlyDropLease(leaseID)
   312  			// if randomlyDropLease encountered an error such as context is cancelled, remove the lease from aliveLeases
   313  			// because we can't tell whether the lease is dropped or not.
   314  			if err != nil {
   315  				ls.lg.Debug(
   316  					"randomlyDropLease failed",
   317  					zap.String("endpoint", ls.m.EtcdClientEndpoint),
   318  					zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   319  					zap.Error(err),
   320  				)
   321  				ls.aliveLeases.remove(leaseID)
   322  				return
   323  			}
   324  			if !dropped {
   325  				return
   326  			}
   327  			ls.lg.Debug(
   328  				"randomlyDropLease dropped a lease",
   329  				zap.String("stress-type", ls.stype.String()),
   330  				zap.String("endpoint", ls.m.EtcdClientEndpoint),
   331  				zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   332  			)
   333  			ls.revokedLeases.add(leaseID, time.Now())
   334  			ls.aliveLeases.remove(leaseID)
   335  		}(l)
   336  	}
   337  	wg.Wait()
   338  }
   339  
   340  func (ls *leaseStresser) createLease(ttl int64) (int64, error) {
   341  	resp, err := ls.cli.Grant(ls.ctx, ttl)
   342  	if err != nil {
   343  		return -1, err
   344  	}
   345  	return int64(resp.ID), nil
   346  }
   347  
   348  func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
   349  	defer ls.aliveWg.Done()
   350  	ctx, cancel := context.WithCancel(ls.ctx)
   351  	stream, err := ls.cli.KeepAlive(ctx, clientv3.LeaseID(leaseID))
   352  	defer func() { cancel() }()
   353  	for {
   354  		select {
   355  		case <-time.After(500 * time.Millisecond):
   356  		case <-ls.ctx.Done():
   357  			ls.lg.Debug(
   358  				"keepLeaseAlive context canceled",
   359  				zap.String("stress-type", ls.stype.String()),
   360  				zap.String("endpoint", ls.m.EtcdClientEndpoint),
   361  				zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   362  				zap.Error(ls.ctx.Err()),
   363  			)
   364  			// it is  possible that lease expires at invariant checking phase but not at keepLeaseAlive() phase.
   365  			// this scenario is possible when alive lease is just about to expire when keepLeaseAlive() exists and expires at invariant checking phase.
   366  			// to circumvent that scenario, we check each lease before keepalive loop exist to see if it has been renewed in last TTL/2 duration.
   367  			// if it is renewed, this means that invariant checking have at least ttl/2 time before lease expires which is long enough for the checking to finish.
   368  			// if it is not renewed, we remove the lease from the alive map so that the lease doesn't expire during invariant checking
   369  			renewTime, ok := ls.aliveLeases.read(leaseID)
   370  			if ok && renewTime.Add(defaultTTL/2*time.Second).Before(time.Now()) {
   371  				ls.aliveLeases.remove(leaseID)
   372  				ls.lg.Debug(
   373  					"keepLeaseAlive lease has not been renewed, dropped it",
   374  					zap.String("stress-type", ls.stype.String()),
   375  					zap.String("endpoint", ls.m.EtcdClientEndpoint),
   376  					zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   377  				)
   378  			}
   379  			return
   380  		}
   381  
   382  		if err != nil {
   383  			ls.lg.Debug(
   384  				"keepLeaseAlive lease creates stream error",
   385  				zap.String("stress-type", ls.stype.String()),
   386  				zap.String("endpoint", ls.m.EtcdClientEndpoint),
   387  				zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   388  				zap.Error(err),
   389  			)
   390  			cancel()
   391  			ctx, cancel = context.WithCancel(ls.ctx)
   392  			stream, err = ls.cli.KeepAlive(ctx, clientv3.LeaseID(leaseID))
   393  			cancel()
   394  			continue
   395  		}
   396  		if err != nil {
   397  			ls.lg.Debug(
   398  				"keepLeaseAlive failed to receive lease keepalive response",
   399  				zap.String("stress-type", ls.stype.String()),
   400  				zap.String("endpoint", ls.m.EtcdClientEndpoint),
   401  				zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   402  				zap.Error(err),
   403  			)
   404  			continue
   405  		}
   406  
   407  		ls.lg.Debug(
   408  			"keepLeaseAlive waiting on lease stream",
   409  			zap.String("stress-type", ls.stype.String()),
   410  			zap.String("endpoint", ls.m.EtcdClientEndpoint),
   411  			zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   412  		)
   413  		leaseRenewTime := time.Now()
   414  		respRC := <-stream
   415  		if respRC == nil {
   416  			ls.lg.Debug(
   417  				"keepLeaseAlive received nil lease keepalive response",
   418  				zap.String("stress-type", ls.stype.String()),
   419  				zap.String("endpoint", ls.m.EtcdClientEndpoint),
   420  				zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   421  			)
   422  			continue
   423  		}
   424  
   425  		// lease expires after TTL become 0
   426  		// don't send keepalive if the lease has expired
   427  		if respRC.TTL <= 0 {
   428  			ls.lg.Debug(
   429  				"keepLeaseAlive stream received lease keepalive response TTL <= 0",
   430  				zap.String("stress-type", ls.stype.String()),
   431  				zap.String("endpoint", ls.m.EtcdClientEndpoint),
   432  				zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   433  				zap.Int64("ttl", respRC.TTL),
   434  			)
   435  			ls.aliveLeases.remove(leaseID)
   436  			return
   437  		}
   438  		// renew lease timestamp only if lease is present
   439  		ls.lg.Debug(
   440  			"keepLeaseAlive renewed a lease",
   441  			zap.String("stress-type", ls.stype.String()),
   442  			zap.String("endpoint", ls.m.EtcdClientEndpoint),
   443  			zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   444  		)
   445  		ls.aliveLeases.update(leaseID, leaseRenewTime)
   446  	}
   447  }
   448  
   449  // attachKeysWithLease function attaches keys to the lease.
   450  // the format of key is the concat of leaseID + '_' + '<order of key creation>'
   451  // e.g 5186835655248304152_0 for first created key and 5186835655248304152_1 for second created key
   452  func (ls *leaseStresser) attachKeysWithLease(leaseID int64) error {
   453  	var txnPuts []clientv3.Op
   454  	for j := 0; j < ls.keysPerLease; j++ {
   455  		txnput := clientv3.OpPut(
   456  			fmt.Sprintf("%d%s%d", leaseID, "_", j),
   457  			fmt.Sprintf("bar"),
   458  			clientv3.WithLease(clientv3.LeaseID(leaseID)),
   459  		)
   460  		txnPuts = append(txnPuts, txnput)
   461  	}
   462  	// keep retrying until lease is not found or ctx is being canceled
   463  	for ls.ctx.Err() == nil {
   464  		_, err := ls.cli.Txn(ls.ctx).Then(txnPuts...).Commit()
   465  		if err == nil {
   466  			// since all created keys will be deleted too, the number of operations on keys will be roughly 2x the number of created keys
   467  			atomic.AddInt64(&ls.atomicModifiedKey, 2*int64(ls.keysPerLease))
   468  			return nil
   469  		}
   470  		if rpctypes.Error(err) == rpctypes.ErrLeaseNotFound {
   471  			return err
   472  		}
   473  	}
   474  	return ls.ctx.Err()
   475  }
   476  
   477  // randomlyDropLease drops the lease only when the rand.Int(2) returns 1.
   478  // This creates a 50/50 percents chance of dropping a lease
   479  func (ls *leaseStresser) randomlyDropLease(leaseID int64) (bool, error) {
   480  	if rand.Intn(2) != 0 {
   481  		return false, nil
   482  	}
   483  
   484  	// keep retrying until a lease is dropped or ctx is being canceled
   485  	for ls.ctx.Err() == nil {
   486  		_, err := ls.cli.Revoke(ls.ctx, clientv3.LeaseID(leaseID))
   487  		if err == nil || rpctypes.Error(err) == rpctypes.ErrLeaseNotFound {
   488  			return true, nil
   489  		}
   490  	}
   491  
   492  	ls.lg.Debug(
   493  		"randomlyDropLease error",
   494  		zap.String("stress-type", ls.stype.String()),
   495  		zap.String("endpoint", ls.m.EtcdClientEndpoint),
   496  		zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
   497  		zap.Error(ls.ctx.Err()),
   498  	)
   499  	return false, ls.ctx.Err()
   500  }
   501  
   502  func (ls *leaseStresser) Pause() map[string]int {
   503  	return ls.Close()
   504  }
   505  
   506  func (ls *leaseStresser) Close() map[string]int {
   507  	ls.cancel()
   508  	ls.runWg.Wait()
   509  	ls.aliveWg.Wait()
   510  	ls.cli.Close()
   511  	ls.lg.Info(
   512  		"stress STOP",
   513  		zap.String("stress-type", ls.stype.String()),
   514  		zap.String("endpoint", ls.m.EtcdClientEndpoint),
   515  	)
   516  	return nil
   517  }
   518  
   519  func (ls *leaseStresser) ModifiedKeys() int64 {
   520  	return atomic.LoadInt64(&ls.atomicModifiedKey)
   521  }