go.uber.org/yarpc@v1.72.1/yarpctest/stress.go

// Copyright (c) 2022 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package yarpctest

import (
	"context"
	"math/rand"
	"strconv"
	"time"

	"go.uber.org/yarpc/api/peer"
	"go.uber.org/yarpc/api/transport"
	"go.uber.org/yarpc/peer/hostport"
)

// ListStressTest describes the parameters of a stress test for a peer list implementation.
type ListStressTest struct {
	// Workers is the number of goroutines that concurrently choose peers.
	Workers int
	// Duration is how long the stress test runs.
	Duration time.Duration
	// Timeout bounds each individual Choose call.
	Timeout time.Duration
	// Latency is the minimum latency of an individual call.
	// Higher latencies drive up concurrency per worker.
	Latency time.Duration
	// LowStress disables membership and connection churn, measuring peer
	// selection baseline performance without interference.
	LowStress bool
	// New constructs the peer list under test around the given transport.
	New func(peer.Transport) peer.ChooserList
}

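// A minimal usage sketch from a hypothetical external test, assuming
// go.uber.org/yarpc/peer/roundrobin as the list under test (any
// peer.ChooserList constructor will do):
//
//	func TestListStress(t *testing.T) {
//		test := yarpctest.ListStressTest{
//			Workers:  8,
//			Duration: time.Second,
//			Timeout:  10 * time.Millisecond,
//			New: func(trans peer.Transport) peer.ChooserList {
//				return roundrobin.New(trans)
//			},
//		}
//		test.Log(t)
//		report := test.Run(t)
//		report.Log(t)
//	}
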
// Logger is the interface needed by reports to log results.
// A *testing.T, for example, satisfies this interface.
type Logger interface {
	Logf(format string, args ...interface{})
}

// Log writes the parameters for a stress test.
func (t ListStressTest) Log(logger Logger) {
	logger.Logf("workers:  %d\n", t.Workers)
	logger.Logf("duration: %s\n", t.Duration)
	logger.Logf("timeout:  %s\n", t.Timeout)
}

// Run runs a stress test on a peer list.
//
// The stress test creates a fake transport and a vector of fake peers.
// The test concurrently chooses peers from the list with some number of workers
// while simultaneously adding and removing peers from the peer list and
// simulating connection and disconnection with those peers.
func (t ListStressTest) Run(logger Logger) *ListStressTestReport {
	transport := NewFakeTransport()
	list := t.New(transport)
	report := newStressReport(0)

	s := stressor{
		stop:      make(chan struct{}),
		reports:   make(chan *ListStressTestReport),
		timeout:   t.Timeout,
		latency:   t.Latency,
		transport: transport,
		list:      list,
		logger:    logger,
	}

	if err := s.list.Start(); err != nil {
		s.logger.Logf("list start error: %s\n", err.Error())
	}

	var stressors int
	if t.LowStress {
		// In low-stress mode, connect every peer once and install the full
		// membership up front, so workers measure baseline selection
		// performance without churn.
		for i := uint(0); i < numIds; i++ {
			s.transport.SimulateConnect(bitIds[i])
		}
		err := s.list.Update(peer.ListUpdates{
			Additions: idsForBits(allIdsMask),
		})
		if err != nil {
			s.logger.Logf("list update error: %s\n", err.Error())
			report.Errors++
		}
		report.Updates++
	} else {
		// Otherwise, churn connections and membership concurrently with
		// the choosing workers.
		go s.stressTransport(s.reports)
		go s.stressList(s.reports)
		stressors = 2
	}
	for i := 0; i < t.Workers; i++ {
		go s.stressChooser(i)
	}

	time.Sleep(t.Duration)

	close(s.stop)

	// Collect a report from every worker and stressor goroutine.
	for i := 0; i < t.Workers+stressors; i++ {
		report.merge(<-s.reports)
	}

	if err := s.list.Stop(); err != nil {
		s.logger.Logf("list stop error: %s\n", err.Error())
	}

	return report
}

// ListStressTestReport catalogs the results of a peer list stress test.
//
// Each worker keeps track of its own statistics, then sends them through
// a channel to the test runner.
// This allows each worker to have independent memory for its log reports and
// reduces the need for synchronization across threads, which could interfere
// with the test.
// The reports get merged into a final report.
type ListStressTestReport struct {
	Workers int
	Errors  int
	Choices int
	Updates int
	Min     time.Duration
	Max     time.Duration
	Total   time.Duration
}

func newStressReport(numWorkers int) *ListStressTestReport {
	return &ListStressTestReport{
		Workers: numWorkers,
		// Start Min well above any plausible choice latency so the first
		// sample always replaces it.
		Min: 1000 * time.Second,
	}
}

// Log writes the vital statistics for a stress test.
func (r *ListStressTestReport) Log(logger Logger) {
	logger.Logf("choices:  %d\n", r.Choices)
	logger.Logf("updates:  %d\n", r.Updates)
	logger.Logf("errors:   %d\n", r.Errors)
	logger.Logf("min:      %s\n", r.Min)
	if r.Choices != 0 {
		logger.Logf("mean:     %s\n", r.Total/time.Duration(r.Choices))
	}
	logger.Logf("max:      %s\n", r.Max)
}

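// Illustrative output from a report's Log (these numbers are invented for
// illustration; real values depend on the machine and the list under test):
//
//	choices:  1272635
//	updates:  3141
//	errors:   0
//	min:      250ns
//	mean:     1.8µs
//	max:      1.2ms
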
// add records the latency of a single peer choice.
// The idIndex identifies the peer that was selected; it is unused for now,
// but a future version of this test could use it to show which peers a peer
// list's strategy favored over time.
func (r *ListStressTestReport) add(idIndex int, dur time.Duration) {
	r.Choices++
	r.Min = min(r.Min, dur)
	r.Max = max(r.Max, dur)
	r.Total += dur
}

// merge merges test reports from independent workers.
func (r *ListStressTestReport) merge(s *ListStressTestReport) {
	r.Workers += s.Workers
	r.Errors += s.Errors
	r.Choices += s.Choices
	r.Updates += s.Updates
	r.Min = min(r.Min, s.Min)
	r.Max = max(r.Max, s.Max)
	r.Total += s.Total
}

// stressor tracks the parameters and state for a single stress test worker.
type stressor struct {
	// stop is closed to signal all workers to stop.
	stop chan struct{}
	// reports is the channel to which each worker sends its final report,
	// signaling that the worker goroutine is done and transferring ownership
	// of the report memory to the test for merging.
	reports   chan *ListStressTestReport
	timeout   time.Duration
	latency   time.Duration
	transport *FakeTransport
	list      peer.ChooserList
	logger    Logger
}

// stressTransport randomly connects and disconnects each of the 63 known peers.
// These peers may or may not be retained by the peer list at the time the
// connection status changes.
func (s *stressor) stressTransport(reports chan<- *ListStressTestReport) {
	report := newStressReport(0)
	rng := rand.NewSource(0)

	_ = s.transport.Start()
	defer func() {
		_ = s.transport.Stop()
	}()

	// Until we receive a signal to stop...
Loop:
	for {
		select {
		case <-s.stop:
			break Loop
		default:
		}

		// Construct a random bit vector, where each bit signifies whether the
		// peer for that index should be connected or disconnected.
		bits := rng.Int63()
		// A consequence of this is that we may send connected notifications to
		// peers that are already connected, etc.
		// These are valid cases to exercise in a stress test, even if they are
		// not desirable behaviors of a real transport.
		for i := uint(0); i < numIds; i++ {
			bit := (1 << i) & bits
			if bit != 0 {
				s.transport.SimulateConnect(bitIds[i])
			} else {
				s.transport.SimulateDisconnect(bitIds[i])
			}
		}
	}

	reports <- report
}

// stressList sends membership changes to a peer list, using a random subset
// of all 63 peers each time.
// Each update tends to include half of the peers, removing about a quarter of
// the peers from the previous round and adding about a quarter for the next.
// As above, we track which peers the list holds with a bit vector, so we can
// compute set differences with bitwise operations (&^), and all of the
// identifiers are interned up front to avoid allocations.
// This allows us to send peer list updates very quickly.
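// For example (illustrative values), if oldBits is 0b1010 and the new random
// vector newBits is 0b0110, then:
//
//	newBits &^ oldBits == 0b0100 // add the peer at index 2
//	oldBits &^ newBits == 0b1000 // remove the peer at index 3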
func (s *stressor) stressList(reports chan<- *ListStressTestReport) {
	report := newStressReport(0)
	rng := rand.NewSource(1)
	var oldBits int64

	// Until we are asked to stop...
Loop:
	for {
		select {
		case <-s.stop:
			break Loop
		default:
		}

		// Construct peer list updates by giving every peer a 50/50 chance of
		// being included in each round.
		// Use set difference bitwise operations to construct the lists of
		// identifiers to add and remove from the current and previous bit
		// vectors.
		newBits := rng.Int63()
		additions := idsForBits(newBits &^ oldBits)
		removals := idsForBits(oldBits &^ newBits)
		err := s.list.Update(peer.ListUpdates{
			Additions: additions,
			Removals:  removals,
		})
		if err != nil {
			s.logger.Logf("list update error: %s\n", err.Error())
			report.Errors++
			break Loop
		}
		report.Updates++
		oldBits = newBits
	}

	// Clean up: remove every peer the list still retains.
	err := s.list.Update(peer.ListUpdates{
		Removals: idsForBits(oldBits),
	})
	if err != nil {
		s.logger.Logf("final list update error: %s\n", err.Error())
		report.Errors++
	}

	reports <- report
}

// stressChooser rapidly chooses peers from the list, recording the latency
// and peer identity of every successful choice.
func (s *stressor) stressChooser(i int) {
	rng := rand.NewSource(int64(i))
	report := newStressReport(1)

	// Until we are asked to stop...
Loop:
	for {
		// We check for the stop signal before choosing instead of after
		// because the continue statement in the error case bypasses the end of
		// the loop to return here and could cause a deadlock if the other
		// stressors exit first.
		select {
		case <-s.stop:
			break Loop
		default:
		}

		// Request a peer from the peer list.
		// We use a random pre-allocated shard key to exercise the hashring in
		// particular, but this is harmless for all other choosers.
		shardKey := shardKeys[rng.Int63()&shardKeysMask]
		ctx, cancel := context.WithTimeout(context.Background(), s.timeout)
		start := time.Now()
		peer, onFinish, err := s.list.Choose(ctx, &transport.Request{ShardKey: shardKey})
		stop := time.Now()
		if err != nil {
			cancel()
			s.logger.Logf("choose error: %s\n", err.Error())
			report.Errors++
			continue
		}
		// This is a good point for a future version to inject varying load
		// based on the identifier of the peer that was selected, to show how
		// each list behaves in the face of variations in speed of individual
		// instances.
		if s.latency > 0 {
			time.Sleep(s.latency)
		}
		onFinish(nil)
		cancel()

		// Report the latency and identifier of the selected peer.
		id := peer.Identifier()
		index := idIndexes[id]
		report.add(index, stop.Sub(start))
	}

	s.reports <- report
}

// Accessories hereafter.

const (
	// We use a 64 bit vector for peer identifiers, but only get to use 63 bits
	// since the Go random number generator only offers 63 bits of entropy.
	numIds     = 63
	allIdsMask = 1<<numIds - 1
	// We will use 256 unique shard keys.
	shardKeysWidth = 8
	numShardKeys   = 1 << shardKeysWidth
	shardKeysMask  = numShardKeys - 1
)

// Pre-allocated vectors for identifiers and shard keys.
var (
	// Each identifier is the decimal string of its own index.
	bitIds [numIds]peer.Identifier
	// Reverse lookup from identifier name to index.
	idIndexes map[string]int
	shardKeys [numShardKeys]string
)

func init() {
	idIndexes = make(map[string]int, numIds)
	for i := 0; i < numIds; i++ {
		name := strconv.Itoa(i)
		bitIds[i] = hostport.PeerIdentifier(name)
		idIndexes[name] = i
	}
	for i := 0; i < numShardKeys; i++ {
		shardKeys[i] = strconv.Itoa(i)
	}
}

// idsForBits returns the interned identifiers corresponding to every set bit
// in the given bit vector.
func idsForBits(bits int64) []peer.Identifier {
	var ids []peer.Identifier
	for i := uint(0); i < numIds; i++ {
		if (1<<i)&bits != 0 {
			ids = append(ids, bitIds[i])
		}
	}
	return ids
}

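// For example (illustrative), idsForBits(0b101) returns the interned
// identifiers for indexes 0 and 2: []peer.Identifier{bitIds[0], bitIds[2]}.
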
// min returns the lesser of two durations.
func min(a, b time.Duration) time.Duration {
	if a < b {
		return a
	}
	return b
}

// max returns the greater of two durations.
func max(a, b time.Duration) time.Duration {
	if a > b {
		return a
	}
	return b
}