get.pme.sh/pnats@v0.0.0-20240304004023-26bb5a137ed0/server/jetstream_chaos_test.go (about)

     1  // Copyright 2023 The NATS Authors
     2  // Licensed under the Apache License, Version 2.0 (the "License");
     3  // you may not use this file except in compliance with the License.
     4  // You may obtain a copy of the License at
     5  //
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  //go:build js_chaos_tests
    15  // +build js_chaos_tests
    16  
    17  package server
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/json"
    22  	"fmt"
    23  	"math/rand"
    24  	"strings"
    25  	"sync"
    26  	"testing"
    27  	"time"
    28  
    29  	"github.com/nats-io/nats.go"
    30  )
    31  
    32  // Support functions for "chaos" testing (random injected failures)
    33  
    34  type ChaosMonkeyController interface {
     35  	// Launch the monkey as a background goroutine and return
    36  	start()
    37  	// Stop a monkey that was previously started
    38  	stop()
     39  	// Run the monkey synchronously; it returns only once stopped via stop()
    40  	run()
    41  }
    42  
    43  type ClusterChaosMonkey interface {
     44  	// Set defaults and validate the monkey parameters
    45  	validate(t *testing.T, c *cluster)
    46  	// Run the monkey synchronously, until it is manually stopped via stopCh
    47  	run(t *testing.T, c *cluster, stopCh <-chan bool)
    48  }
    49  
    50  // Chaos Monkey Controller that acts on a cluster
    51  type clusterChaosMonkeyController struct {
    52  	t       *testing.T
    53  	cluster *cluster
    54  	wg      sync.WaitGroup
    55  	stopCh  chan bool
    56  	ccm     ClusterChaosMonkey
    57  }
    58  
    59  func createClusterChaosMonkeyController(t *testing.T, c *cluster, ccm ClusterChaosMonkey) ChaosMonkeyController {
    60  	ccm.validate(t, c)
    61  	return &clusterChaosMonkeyController{
    62  		t:       t,
    63  		cluster: c,
    64  		stopCh:  make(chan bool, 3),
    65  		ccm:     ccm,
    66  	}
    67  }
    68  
    69  func (m *clusterChaosMonkeyController) start() {
    70  	m.t.Logf("🐵 Starting monkey")
    71  	m.wg.Add(1)
    72  	go func() {
    73  		defer m.wg.Done()
    74  		m.run()
    75  	}()
    76  }
    77  
    78  func (m *clusterChaosMonkeyController) stop() {
    79  	m.t.Logf("🐵 Stopping monkey")
    80  	m.stopCh <- true
    81  	m.wg.Wait()
    82  	m.t.Logf("🐵 Monkey stopped")
    83  }
    84  
    85  func (m *clusterChaosMonkeyController) run() {
    86  	m.ccm.run(m.t, m.cluster, m.stopCh)
    87  }
    88  
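         // Typical wiring, as used by the tests below: wrap a ClusterChaosMonkey in a
         // controller, start it once the cluster (and any streams) are set up, and defer
         // stop() so the monkey is torn down before the cluster shuts down. Roughly:
         //
         //   chaos := createClusterChaosMonkeyController(t, c, &clusterBouncerChaosMonkey{...})
         //   chaos.start()
         //   defer chaos.stop()
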
     89  // Cluster Chaos Monkey that selects a random subset of the cluster's nodes (within the min/max provided),
     90  // shuts them down for a random duration (within the min/max provided), then brings them back up.
     91  // It then pauses for a fixed time and repeats until stopped.
    92  type clusterBouncerChaosMonkey struct {
    93  	minDowntime    time.Duration
    94  	maxDowntime    time.Duration
    95  	minDownServers int
    96  	maxDownServers int
    97  	pause          time.Duration
    98  }
    99  
   100  func (m *clusterBouncerChaosMonkey) validate(t *testing.T, c *cluster) {
   101  	if m.minDowntime > m.maxDowntime {
   102  		t.Fatalf("Min downtime %v cannot be larger than max downtime %v", m.minDowntime, m.maxDowntime)
   103  	}
   104  
   105  	if m.minDownServers > m.maxDownServers {
   106  		t.Fatalf("Min down servers %v cannot be larger than max down servers %v", m.minDownServers, m.maxDownServers)
   107  	}
   108  }
   109  
   110  func (m *clusterBouncerChaosMonkey) run(t *testing.T, c *cluster, stopCh <-chan bool) {
   111  	for {
   112  		// Pause between actions
   113  		select {
   114  		case <-stopCh:
   115  			return
   116  		case <-time.After(m.pause):
   117  		}
   118  
   119  		// Pick a random subset of servers
   120  		numServersDown := rand.Intn(1+m.maxDownServers-m.minDownServers) + m.minDownServers
   121  		servers := c.selectRandomServers(numServersDown)
   122  		serverNames := []string{}
   123  		for _, s := range servers {
   124  			serverNames = append(serverNames, s.info.Name)
   125  		}
   126  
   127  		// Pick a random outage interval
   128  		minOutageNanos := m.minDowntime.Nanoseconds()
   129  		maxOutageNanos := m.maxDowntime.Nanoseconds()
   130  		outageDurationNanos := rand.Int63n(1+maxOutageNanos-minOutageNanos) + minOutageNanos
   131  		outageDuration := time.Duration(outageDurationNanos)
   132  
   133  		// Take down selected servers
   134  		t.Logf("🐵 Taking down %d/%d servers for %v (%v)", numServersDown, len(c.servers), outageDuration, serverNames)
   135  		c.stopSubset(servers)
   136  
   137  		// Wait for the "outage" duration
   138  		select {
   139  		case <-stopCh:
   140  			return
   141  		case <-time.After(outageDuration):
   142  		}
   143  
   144  		// Restart servers and wait for cluster to be healthy
   145  		t.Logf("🐵 Restoring cluster")
   146  		c.restartAllSamePorts()
   147  		c.waitOnClusterHealthz()
   148  
   149  		c.waitOnClusterReady()
   150  		c.waitOnAllCurrent()
   151  		c.waitOnLeader()
   152  	}
   153  }
   154  
   155  // Additional cluster methods for chaos testing
   156  
   157  func (c *cluster) waitOnClusterHealthz() {
   158  	c.t.Helper()
   159  	for _, cs := range c.servers {
   160  		c.waitOnServerHealthz(cs)
   161  	}
   162  }
   163  
   164  func (c *cluster) stopSubset(toStop []*Server) {
   165  	c.t.Helper()
   166  	for _, s := range toStop {
   167  		s.Shutdown()
   168  	}
   169  }
   170  
   171  func (c *cluster) selectRandomServers(numServers int) []*Server {
   172  	c.t.Helper()
   173  	if numServers > len(c.servers) {
   174  		panic(fmt.Sprintf("Can't select %d servers in a cluster of %d", numServers, len(c.servers)))
   175  	}
   176  	var selectedServers []*Server
   177  	selectedServers = append(selectedServers, c.servers...)
   178  	rand.Shuffle(len(selectedServers), func(x, y int) {
   179  		selectedServers[x], selectedServers[y] = selectedServers[y], selectedServers[x]
   180  	})
   181  	return selectedServers[0:numServers]
   182  }
   183  
   184  // Other helpers
   185  
   186  func jsClientConnectCluster(t testing.TB, c *cluster) (*nats.Conn, nats.JetStreamContext) {
   187  	serverConnectURLs := make([]string, len(c.servers))
   188  	for i, server := range c.servers {
   189  		serverConnectURLs[i] = server.ClientURL()
   190  	}
   191  	connectURL := strings.Join(serverConnectURLs, ",")
   192  
   193  	nc, err := nats.Connect(connectURL)
   194  	if err != nil {
   195  		t.Fatalf("Failed to connect: %s", err)
   196  	}
   197  
   198  	js, err := nc.JetStream()
   199  	if err != nil {
   200  		t.Fatalf("Failed to init JetStream context: %s", err)
   201  	}
   202  
   203  	return nc, js
   204  }
   205  
   206  func toIndentedJsonString(v any) string {
   207  	s, err := json.MarshalIndent(v, "", "  ")
   208  	if err != nil {
   209  		panic(err)
   210  	}
   211  	return string(s)
   212  }
   213  
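         // Note: the consumer tests below track received stream sequences with the package's
         // Bitset test helper (defined elsewhere in this package's test helpers). Only a small
         // part of its behavior is relied upon here, roughly:
         //
         //   received := NewBitset(numMessages) // all bits initially unset
         //   received.get(seq - 1)              // true if sequence seq was already seen (duplicate)
         //   received.set(seq-1, true)          // mark sequence seq as received
         //   received.count()                   // number of distinct sequences received so far
         //
         // This is an informal sketch of the assumed behavior, not the helper's definition.
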
   214  // Bounces the entire set of nodes, then brings them back up.
    215  // Fails if some nodes don't come back online.
   216  func TestJetStreamChaosClusterBounce(t *testing.T) {
   217  
   218  	const duration = 60 * time.Second
   219  	const clusterSize = 3
   220  
   221  	c := createJetStreamClusterExplicit(t, "R3", clusterSize)
   222  	defer c.shutdown()
   223  
   224  	chaos := createClusterChaosMonkeyController(
   225  		t,
   226  		c,
   227  		&clusterBouncerChaosMonkey{
   228  			minDowntime:    0 * time.Second,
   229  			maxDowntime:    2 * time.Second,
   230  			minDownServers: clusterSize,
   231  			maxDownServers: clusterSize,
   232  			pause:          3 * time.Second,
   233  		},
   234  	)
   235  	chaos.start()
   236  	defer chaos.stop()
   237  
   238  	<-time.After(duration)
   239  }
   240  
   241  // Bounces a subset of the nodes, then brings them back up.
   242  // Fails if some nodes don't come back online.
   243  func TestJetStreamChaosClusterBounceSubset(t *testing.T) {
   244  
   245  	const duration = 60 * time.Second
   246  	const clusterSize = 3
   247  
   248  	c := createJetStreamClusterExplicit(t, "R3", clusterSize)
   249  	defer c.shutdown()
   250  
   251  	chaos := createClusterChaosMonkeyController(
   252  		t,
   253  		c,
   254  		&clusterBouncerChaosMonkey{
   255  			minDowntime:    0 * time.Second,
   256  			maxDowntime:    2 * time.Second,
   257  			minDownServers: 1,
   258  			maxDownServers: clusterSize,
   259  			pause:          3 * time.Second,
   260  		},
   261  	)
   262  	chaos.start()
   263  	defer chaos.stop()
   264  
   265  	<-time.After(duration)
   266  }
   267  
   268  const (
   269  	chaosConsumerTestsClusterName = "CONSUMERS_CHAOS_TEST"
   270  	chaosConsumerTestsStreamName  = "CONSUMER_CHAOS_TEST_STREAM"
   271  	chaosConsumerTestsSubject     = "foo"
   272  	chaosConsumerTestsDebug       = false
   273  )
   274  
    275  // Creates a stream and fills it with the given number of messages.
   276  // Each message is the string representation of the stream sequence number,
   277  // e.g. the first message (seqno: 1) contains data "1".
   278  // This allows consumers to verify the content of each message without tracking additional state
   279  func createStreamForConsumerChaosTest(t *testing.T, c *cluster, replicas, numMessages int) {
   280  	t.Helper()
   281  
   282  	const publishBatchSize = 1_000
   283  
   284  	pubNc, pubJs := jsClientConnectCluster(t, c)
   285  	defer pubNc.Close()
   286  
   287  	_, err := pubJs.AddStream(&nats.StreamConfig{
   288  		Name:     chaosConsumerTestsStreamName,
   289  		Subjects: []string{chaosConsumerTestsSubject},
   290  		Replicas: replicas,
   291  	})
   292  	if err != nil {
   293  		t.Fatalf("Error creating stream: %v", err)
   294  	}
   295  
   296  	ackFutures := make([]nats.PubAckFuture, 0, publishBatchSize)
   297  
   298  	for i := 1; i <= numMessages; i++ {
   299  		message := []byte(fmt.Sprintf("%d", i))
   300  		pubAckFuture, err := pubJs.PublishAsync(chaosConsumerTestsSubject, message, nats.ExpectLastSequence(uint64(i-1)))
   301  		if err != nil {
   302  			t.Fatalf("Publish error: %s", err)
   303  		}
   304  		ackFutures = append(ackFutures, pubAckFuture)
   305  
   306  		if (i > 0 && i%publishBatchSize == 0) || i == numMessages {
   307  			select {
   308  			case <-pubJs.PublishAsyncComplete():
   309  				for _, pubAckFuture := range ackFutures {
   310  					select {
   311  					case <-pubAckFuture.Ok():
   312  						// Noop
   313  					case pubAckErr := <-pubAckFuture.Err():
   314  						t.Fatalf("Error publishing: %s", pubAckErr)
   315  					case <-time.After(30 * time.Second):
   316  						t.Fatalf("Timeout verifying pubAck for message: %s", pubAckFuture.Msg().Data)
   317  					}
   318  				}
   319  				ackFutures = make([]nats.PubAckFuture, 0, publishBatchSize)
   320  				t.Logf("Published %d/%d messages", i, numMessages)
   321  
   322  			case <-time.After(30 * time.Second):
   323  				t.Fatalf("Publish timed out")
   324  			}
   325  		}
   326  	}
   327  }
   328  
   329  // Verify ordered delivery despite cluster-wide outages
   330  func TestJetStreamChaosConsumerOrdered(t *testing.T) {
   331  
   332  	const numMessages = 5_000
   333  	const numBatch = 500
   334  	const maxRetries = 100
   335  	const retryDelay = 500 * time.Millisecond
   336  	const fetchTimeout = 250 * time.Millisecond
   337  	const clusterSize = 3
   338  	const replicas = 3
   339  
   340  	c := createJetStreamClusterExplicit(t, chaosConsumerTestsClusterName, clusterSize)
   341  	defer c.shutdown()
   342  
   343  	createStreamForConsumerChaosTest(t, c, replicas, numMessages)
   344  
   345  	chaos := createClusterChaosMonkeyController(
   346  		t,
   347  		c,
   348  		&clusterBouncerChaosMonkey{
   349  			minDowntime:    0 * time.Second,
   350  			maxDowntime:    2 * time.Second,
   351  			minDownServers: clusterSize, // Whole cluster outage
   352  			maxDownServers: clusterSize,
   353  			pause:          1 * time.Second,
   354  		},
   355  	)
   356  
   357  	subNc, subJs := jsClientConnectCluster(t, c)
   358  	defer subNc.Close()
   359  
   360  	sub, err := subJs.SubscribeSync(
   361  		chaosConsumerTestsSubject,
   362  		nats.OrderedConsumer(),
   363  	)
   364  	if err != nil {
   365  		t.Fatalf("Unexpected error: %v", err)
   366  	}
   367  	defer sub.Unsubscribe()
   368  
   369  	if chaosConsumerTestsDebug {
   370  		t.Logf("Initial subscription: %s", toIndentedJsonString(sub))
   371  	}
   372  
   373  	chaos.start()
   374  	defer chaos.stop()
   375  
   376  	for i := 1; i <= numMessages; i++ {
   377  		var msg *nats.Msg
   378  		var nextMsgErr error
   379  		var expectedMsgData = []byte(fmt.Sprintf("%d", i))
   380  
   381  	nextMsgRetryLoop:
   382  		for r := 0; r <= maxRetries; r++ {
   383  			msg, nextMsgErr = sub.NextMsg(fetchTimeout)
   384  			if nextMsgErr == nil {
   385  				break nextMsgRetryLoop
   386  			} else if r == maxRetries {
   387  				t.Fatalf("Exceeded max retries for NextMsg")
   388  			} else if nextMsgErr == nats.ErrBadSubscription {
   389  				t.Fatalf("Subscription is invalid: %s", toIndentedJsonString(sub))
   390  			} else {
   391  				time.Sleep(retryDelay)
   392  			}
   393  		}
   394  
   395  		metadata, err := msg.Metadata()
   396  		if err != nil {
   397  			t.Fatalf("Failed to get message metadata: %v", err)
   398  		}
   399  
   400  		if metadata.Sequence.Stream != uint64(i) {
   401  			t.Fatalf("Expecting stream sequence %d, got %d instead", i, metadata.Sequence.Stream)
   402  		}
   403  
   404  		if !bytes.Equal(msg.Data, expectedMsgData) {
   405  			t.Fatalf("Expecting message %s, got %s instead", expectedMsgData, msg.Data)
   406  		}
   407  
   408  		// Simulate application processing (and gives the monkey some time to brew chaos)
   409  		time.Sleep(10 * time.Millisecond)
   410  
   411  		if i%numBatch == 0 {
   412  			t.Logf("Consumed %d/%d", i, numMessages)
   413  		}
   414  	}
   415  }
   416  
    417  // Verify delivery of all messages (duplicates tolerated) via an async push consumer, despite cluster-wide outages
   418  func TestJetStreamChaosConsumerAsync(t *testing.T) {
   419  
   420  	const numMessages = 5_000
   421  	const numBatch = 500
   422  	const timeout = 30 * time.Second // No (new) messages for 30s => terminate
   423  	const maxRetries = 25
   424  	const retryDelay = 500 * time.Millisecond
   425  	const clusterSize = 3
   426  	const replicas = 3
   427  
   428  	c := createJetStreamClusterExplicit(t, chaosConsumerTestsClusterName, clusterSize)
   429  	defer c.shutdown()
   430  
   431  	createStreamForConsumerChaosTest(t, c, replicas, numMessages)
   432  
   433  	chaos := createClusterChaosMonkeyController(
   434  		t,
   435  		c,
   436  		&clusterBouncerChaosMonkey{
   437  			minDowntime:    0 * time.Second,
   438  			maxDowntime:    2 * time.Second,
   439  			minDownServers: clusterSize,
   440  			maxDownServers: clusterSize,
   441  			pause:          2 * time.Second,
   442  		},
   443  	)
   444  
   445  	subNc, subJs := jsClientConnectCluster(t, c)
   446  	defer subNc.Close()
   447  
   448  	timeoutTimer := time.NewTimer(timeout)
   449  	deliveryCount := uint64(0)
   450  	received := NewBitset(numMessages)
   451  
   452  	handleMsg := func(msg *nats.Msg) {
   453  		deliveryCount += 1
   454  
   455  		metadata, err := msg.Metadata()
   456  		if err != nil {
   457  			t.Fatalf("Failed to get message metadata: %v", err)
   458  		}
   459  		seq := metadata.Sequence.Stream
   460  
   461  		var expectedMsgData = []byte(fmt.Sprintf("%d", seq))
   462  		if !bytes.Equal(msg.Data, expectedMsgData) {
   463  			t.Fatalf("Expecting message content '%s', got '%s' instead", expectedMsgData, msg.Data)
   464  		}
   465  
   466  		isDupe := received.get(seq - 1)
   467  
   468  		if isDupe {
   469  			if chaosConsumerTestsDebug {
   470  				t.Logf("Duplicate message delivery, seq: %d", seq)
   471  			}
   472  			return
   473  		}
   474  
   475  		// Mark this sequence as received
   476  		received.set(seq-1, true)
   477  		if received.count() < numMessages {
   478  			// Reset timeout
   479  			timeoutTimer.Reset(timeout)
   480  		} else {
   481  			// All received, speed up the shutdown
   482  			timeoutTimer.Reset(1 * time.Second)
   483  		}
   484  
   485  		if received.count()%numBatch == 0 {
   486  			t.Logf("Consumed %d/%d", received.count(), numMessages)
   487  		}
   488  
   489  		// Simulate application processing (and gives the monkey some time to brew chaos)
   490  		time.Sleep(10 * time.Millisecond)
   491  
   492  	ackRetryLoop:
   493  		for i := 0; i <= maxRetries; i++ {
   494  			ackErr := msg.Ack()
   495  			if ackErr == nil {
   496  				break ackRetryLoop
   497  			} else if i == maxRetries {
   498  				t.Fatalf("Failed to ACK message %d (retried %d times)", seq, maxRetries)
   499  			} else {
   500  				time.Sleep(retryDelay)
   501  			}
   502  		}
   503  	}
   504  
   505  	subOpts := []nats.SubOpt{}
   506  	sub, err := subJs.Subscribe(chaosConsumerTestsSubject, handleMsg, subOpts...)
   507  	if err != nil {
   508  		t.Fatalf("Unexpected error: %v", err)
   509  	}
   510  	defer sub.Unsubscribe()
   511  
   512  	chaos.start()
   513  	defer chaos.stop()
   514  
   515  	// Wait for long enough silence.
   516  	// Either a stall, or all messages received
   517  	<-timeoutTimer.C
   518  
   519  	// Shut down consumer
   520  	sub.Unsubscribe()
   521  
   522  	uniqueDeliveredCount := received.count()
   523  
   524  	t.Logf(
    525  		"Delivered %d/%d messages, %d duplicate deliveries",
   526  		uniqueDeliveredCount,
   527  		numMessages,
   528  		deliveryCount-uniqueDeliveredCount,
   529  	)
   530  
   531  	if uniqueDeliveredCount != numMessages {
   532  		t.Fatalf("No new message delivered in the last %s, %d/%d messages never delivered", timeout, numMessages-uniqueDeliveredCount, numMessages)
   533  	}
   534  }
   535  
   536  // Verify durable consumer retains state despite cluster-wide outages
   537  // The consumer connection is also periodically closed, and the consumer 'resumes' on a different one
   538  func TestJetStreamChaosConsumerDurable(t *testing.T) {
   539  
   540  	const numMessages = 5_000
   541  	const numBatch = 500
    542  	const timeout = 30 * time.Second // No (new) messages for 30s => terminate
   543  	const clusterSize = 3
   544  	const replicas = 3
   545  	const maxRetries = 25
   546  	const retryDelay = 500 * time.Millisecond
   547  	const durableConsumerName = "durable"
   548  
   549  	c := createJetStreamClusterExplicit(t, chaosConsumerTestsClusterName, clusterSize)
   550  	defer c.shutdown()
   551  
   552  	createStreamForConsumerChaosTest(t, c, replicas, numMessages)
   553  
   554  	chaos := createClusterChaosMonkeyController(
   555  		t,
   556  		c,
   557  		&clusterBouncerChaosMonkey{
   558  			minDowntime:    0 * time.Second,
   559  			maxDowntime:    2 * time.Second,
   560  			minDownServers: 1,
   561  			maxDownServers: clusterSize,
   562  			pause:          3 * time.Second,
   563  		},
   564  	)
   565  
   566  	var nc *nats.Conn
   567  	var sub *nats.Subscription
   568  	var subLock sync.Mutex
   569  
   570  	var handleMsgFun func(msg *nats.Msg)
   571  	var natsURL string
   572  
   573  	{
   574  		var sb strings.Builder
   575  		for _, s := range c.servers {
   576  			sb.WriteString(s.ClientURL())
   577  			sb.WriteString(",")
   578  		}
   579  		natsURL = sb.String()
   580  	}
   581  
   582  	resetDurableConsumer := func() {
   583  		subLock.Lock()
   584  		defer subLock.Unlock()
   585  
   586  		if nc != nil {
   587  			nc.Close()
   588  		}
   589  
   590  		var newNc *nats.Conn
   591  	connectRetryLoop:
   592  		for r := 0; r <= maxRetries; r++ {
   593  			var connErr error
   594  			newNc, connErr = nats.Connect(natsURL)
   595  			if connErr == nil {
   596  				break connectRetryLoop
   597  			} else if r == maxRetries {
   598  				t.Fatalf("Failed to connect, exceeded max retries, last error: %s", connErr)
   599  			} else {
   600  				time.Sleep(retryDelay)
   601  			}
   602  		}
   603  
   604  		var newJs nats.JetStreamContext
   605  	jsRetryLoop:
   606  		for r := 0; r <= maxRetries; r++ {
   607  			var jsErr error
   608  			newJs, jsErr = newNc.JetStream(nats.MaxWait(10 * time.Second))
   609  			if jsErr == nil {
   610  				break jsRetryLoop
   611  			} else if r == maxRetries {
   612  				t.Fatalf("Failed to get JS, exceeded max retries, last error: %s", jsErr)
   613  			} else {
   614  				time.Sleep(retryDelay)
   615  			}
   616  		}
   617  
   618  		subOpts := []nats.SubOpt{
   619  			nats.Durable(durableConsumerName),
   620  		}
   621  
   622  		var newSub *nats.Subscription
   623  	subscribeRetryLoop:
   624  		for i := 0; i <= maxRetries; i++ {
   625  			var subErr error
   626  			newSub, subErr = newJs.Subscribe(chaosConsumerTestsSubject, handleMsgFun, subOpts...)
   627  			if subErr == nil {
   628  				ci, err := newJs.ConsumerInfo(chaosConsumerTestsStreamName, durableConsumerName)
   629  				if err == nil {
   630  					if chaosConsumerTestsDebug {
   631  						t.Logf("Consumer info:\n %s", toIndentedJsonString(ci))
   632  					}
   633  				} else {
   634  					t.Logf("Failed to retrieve consumer info: %s", err)
   635  				}
   636  
   637  				break subscribeRetryLoop
   638  			} else if i == maxRetries {
   639  				t.Fatalf("Exceeded max retries creating subscription: %v", subErr)
   640  			} else {
   641  				time.Sleep(retryDelay)
   642  			}
   643  		}
   644  
   645  		nc, sub = newNc, newSub
   646  	}
   647  
   648  	timeoutTimer := time.NewTimer(timeout)
   649  	deliveryCount := uint64(0)
   650  	received := NewBitset(numMessages)
   651  
   652  	handleMsgFun = func(msg *nats.Msg) {
   653  
   654  		subLock.Lock()
   655  		if msg.Sub != sub {
   656  			// Message from a previous instance of durable consumer, drop
   657  			defer subLock.Unlock()
   658  			return
   659  		}
   660  		subLock.Unlock()
   661  
   662  		deliveryCount += 1
   663  
   664  		metadata, err := msg.Metadata()
   665  		if err != nil {
   666  			t.Fatalf("Failed to get message metadata: %v", err)
   667  		}
   668  		seq := metadata.Sequence.Stream
   669  
   670  		var expectedMsgData = []byte(fmt.Sprintf("%d", seq))
   671  		if !bytes.Equal(msg.Data, expectedMsgData) {
   672  			t.Fatalf("Expecting message content '%s', got '%s' instead", expectedMsgData, msg.Data)
   673  		}
   674  
   675  		isDupe := received.get(seq - 1)
   676  
   677  		if isDupe {
   678  			if chaosConsumerTestsDebug {
   679  				t.Logf("Duplicate message delivery, seq: %d", seq)
   680  			}
   681  			return
   682  		}
   683  
   684  		// Mark this sequence as received
   685  		received.set(seq-1, true)
   686  		if received.count() < numMessages {
   687  			// Reset timeout
   688  			timeoutTimer.Reset(timeout)
   689  		} else {
   690  			// All received, speed up the shutdown
   691  			timeoutTimer.Reset(1 * time.Second)
   692  		}
   693  
   694  		// Simulate application processing (and gives the monkey some time to brew chaos)
   695  		time.Sleep(10 * time.Millisecond)
   696  
   697  	ackRetryLoop:
   698  		for i := 0; i <= maxRetries; i++ {
   699  			ackErr := msg.Ack()
   700  			if ackErr == nil {
   701  				break ackRetryLoop
   702  			} else if i == maxRetries {
   703  				t.Fatalf("Failed to ACK message %d (retried %d times)", seq, maxRetries)
   704  			} else {
   705  				time.Sleep(retryDelay)
   706  			}
   707  		}
   708  
   709  		if received.count()%numBatch == 0 {
   710  			t.Logf("Consumed %d/%d, duplicate deliveries: %d", received.count(), numMessages, deliveryCount-received.count())
   711  			// Close connection and resume consuming on a different one
   712  			resetDurableConsumer()
   713  		}
   714  	}
   715  
   716  	resetDurableConsumer()
   717  
   718  	chaos.start()
   719  	defer chaos.stop()
   720  
   721  	// Wait for long enough silence.
   722  	// Either a stall, or all messages received
   723  	<-timeoutTimer.C
   724  
   725  	// Shut down consumer
   726  	if sub != nil {
   727  		sub.Unsubscribe()
   728  	}
   729  
   730  	uniqueDeliveredCount := received.count()
   731  
   732  	t.Logf(
    733  		"Delivered %d/%d messages, %d duplicate deliveries",
   734  		uniqueDeliveredCount,
   735  		numMessages,
   736  		deliveryCount-uniqueDeliveredCount,
   737  	)
   738  
   739  	if uniqueDeliveredCount != numMessages {
   740  		t.Fatalf("No new message delivered in the last %s, %d/%d messages never delivered", timeout, numMessages-uniqueDeliveredCount, numMessages)
   741  	}
   742  }
   743  
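         // Verify delivery of all messages through a durable pull consumer (explicit ACK,
         // duplicate deliveries tolerated) despite cluster-wide outages.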
   744  func TestJetStreamChaosConsumerPull(t *testing.T) {
   745  
   746  	const numMessages = 5_000
   747  	const numBatch = 500
   748  	const maxRetries = 100
   749  	const retryDelay = 500 * time.Millisecond
   750  	const fetchTimeout = 250 * time.Millisecond
   751  	const fetchBatchSize = 100
   752  	const clusterSize = 3
   753  	const replicas = 3
   754  	const durableConsumerName = "durable"
   755  
   756  	c := createJetStreamClusterExplicit(t, chaosConsumerTestsClusterName, clusterSize)
   757  	defer c.shutdown()
   758  
   759  	createStreamForConsumerChaosTest(t, c, replicas, numMessages)
   760  
   761  	chaos := createClusterChaosMonkeyController(
   762  		t,
   763  		c,
   764  		&clusterBouncerChaosMonkey{
   765  			minDowntime:    0 * time.Second,
   766  			maxDowntime:    2 * time.Second,
   767  			minDownServers: clusterSize, // Whole cluster outage
   768  			maxDownServers: clusterSize,
   769  			pause:          1 * time.Second,
   770  		},
   771  	)
   772  
   773  	subNc, subJs := jsClientConnectCluster(t, c)
   774  	defer subNc.Close()
   775  
   776  	sub, err := subJs.PullSubscribe(
   777  		chaosConsumerTestsSubject,
   778  		durableConsumerName,
   779  	)
   780  	if err != nil {
   781  		t.Fatalf("Unexpected error: %v", err)
   782  	}
   783  	defer sub.Unsubscribe()
   784  
   785  	if chaosConsumerTestsDebug {
   786  		t.Logf("Initial subscription: %s", toIndentedJsonString(sub))
   787  	}
   788  
   789  	chaos.start()
   790  	defer chaos.stop()
   791  
   792  	fetchMaxWait := nats.MaxWait(fetchTimeout)
   793  	received := NewBitset(numMessages)
   794  	deliveredCount := uint64(0)
   795  
   796  	for received.count() < numMessages {
   797  
   798  		var msgs []*nats.Msg
   799  		var fetchErr error
   800  
   801  	fetchRetryLoop:
   802  		for r := 0; r <= maxRetries; r++ {
   803  			msgs, fetchErr = sub.Fetch(fetchBatchSize, fetchMaxWait)
   804  			if fetchErr == nil {
   805  				break fetchRetryLoop
   806  			} else if r == maxRetries {
   807  				t.Fatalf("Exceeded max retries for Fetch, last error: %s", fetchErr)
   808  			} else if fetchErr == nats.ErrBadSubscription {
   809  				t.Fatalf("Subscription is invalid: %s", toIndentedJsonString(sub))
   810  			} else {
   811  				// t.Logf("Fetch error: %v", fetchErr)
   812  				time.Sleep(retryDelay)
   813  			}
   814  		}
   815  
   816  		for _, msg := range msgs {
   817  
   818  			deliveredCount += 1
   819  
   820  			metadata, err := msg.Metadata()
   821  			if err != nil {
   822  				t.Fatalf("Failed to get message metadata: %v", err)
   823  			}
   824  
   825  			streamSeq := metadata.Sequence.Stream
   826  
   827  			expectedMsgData := []byte(fmt.Sprintf("%d", streamSeq))
   828  
   829  			if !bytes.Equal(msg.Data, expectedMsgData) {
   830  				t.Fatalf("Expecting message %s, got %s instead", expectedMsgData, msg.Data)
   831  			}
   832  
   833  			isDupe := received.get(streamSeq - 1)
   834  
   835  			received.set(streamSeq-1, true)
   836  
   837  			// Simulate application processing (and gives the monkey some time to brew chaos)
   838  			time.Sleep(10 * time.Millisecond)
   839  
   840  		ackRetryLoop:
   841  			for r := 0; r <= maxRetries; r++ {
   842  				ackErr := msg.Ack()
   843  				if ackErr == nil {
   844  					break ackRetryLoop
   845  				} else if r == maxRetries {
   846  					t.Fatalf("Failed to ACK message %d, last error: %s", streamSeq, ackErr)
   847  				} else {
   848  					time.Sleep(retryDelay)
   849  				}
   850  			}
   851  
   852  			if !isDupe && received.count()%numBatch == 0 {
   853  				t.Logf("Consumed %d/%d (duplicates: %d)", received.count(), numMessages, deliveredCount-received.count())
   854  			}
   855  		}
   856  	}
   857  }
   858  
   859  const (
   860  	chaosKvTestsClusterName = "KV_CHAOS_TEST"
   861  	chaosKvTestsBucketName  = "KV_CHAOS_TEST_BUCKET"
   862  	chaosKvTestsSubject     = "foo"
   863  	chaosKvTestsDebug       = false
   864  )
   865  
    866  // Creates a KV store (a.k.a. bucket).
   867  func createBucketForKvChaosTest(t *testing.T, c *cluster, replicas int) {
   868  	t.Helper()
   869  
   870  	pubNc, pubJs := jsClientConnectCluster(t, c)
   871  	defer pubNc.Close()
   872  
   873  	config := nats.KeyValueConfig{
   874  		Bucket:      chaosKvTestsBucketName,
   875  		Replicas:    replicas,
   876  		Description: "Test bucket",
   877  	}
   878  
   879  	kvs, err := pubJs.CreateKeyValue(&config)
   880  	if err != nil {
   881  		t.Fatalf("Error creating bucket: %v", err)
   882  	}
   883  
   884  	status, err := kvs.Status()
   885  	if err != nil {
   886  		t.Fatalf("Error retrieving bucket status: %v", err)
   887  	}
   888  	t.Logf("Bucket created: %s", status.Bucket())
   889  }
   890  
    891  // A single client performs a series of PUTs on a single key.
   892  // If PUT is successful, perform a GET on the same key.
   893  // If GET is successful, ensure key revision and value match the most recent successful write.
   894  func TestJetStreamChaosKvPutGet(t *testing.T) {
   895  
   896  	const numOps = 100_000
   897  	const clusterSize = 3
   898  	const replicas = 3
   899  	const key = "key"
   900  	const staleReadsOk = true // Set to false to check for violations of 'read committed' consistency
   901  
   902  	c := createJetStreamClusterExplicit(t, chaosKvTestsClusterName, clusterSize)
   903  	defer c.shutdown()
   904  
   905  	createBucketForKvChaosTest(t, c, replicas)
   906  
   907  	chaos := createClusterChaosMonkeyController(
   908  		t,
   909  		c,
   910  		&clusterBouncerChaosMonkey{
   911  			minDowntime:    0 * time.Second,
   912  			maxDowntime:    2 * time.Second,
   913  			minDownServers: clusterSize, // Whole cluster outage
   914  			maxDownServers: clusterSize,
   915  			pause:          1 * time.Second,
   916  		},
   917  	)
   918  
   919  	nc, js := jsClientConnectCluster(t, c)
   920  	defer nc.Close()
   921  
   922  	// Create KV bucket
   923  	kv, err := js.KeyValue(chaosKvTestsBucketName)
   924  	if err != nil {
   925  		t.Fatalf("Failed to get KV store: %v", err)
   926  	}
   927  
   928  	// Initialize the only key
   929  	firstRevision, err := kv.Create(key, []byte("INITIAL VALUE"))
   930  	if err != nil {
   931  		t.Fatalf("Failed to create key: %v", err)
   932  	} else if firstRevision != 1 {
   933  		t.Fatalf("Unexpected revision: %d", firstRevision)
   934  	}
   935  
   936  	// Start chaos
   937  	chaos.start()
   938  	defer chaos.stop()
   939  
   940  	staleReadsCount := uint64(0)
   941  	successCount := uint64(0)
   942  
   943  	previousRevision := firstRevision
   944  
   945  putGetLoop:
   946  	for i := 1; i <= numOps; i++ {
   947  
   948  		if i%1000 == 0 {
   949  			t.Logf("Completed %d/%d PUT+GET operations", i, numOps)
   950  		}
   951  
   952  		// PUT a value
   953  		putValue := fmt.Sprintf("value-%d", i)
   954  		putRevision, err := kv.Put(key, []byte(putValue))
   955  		if err != nil {
   956  			t.Logf("PUT error: %v", err)
   957  			continue putGetLoop
   958  		}
   959  
   960  		// Check revision is monotonically increasing
   961  		if putRevision <= previousRevision {
   962  			t.Fatalf("PUT produced revision %d which is not greater than the previous successful PUT revision: %d", putRevision, previousRevision)
   963  		}
   964  
   965  		previousRevision = putRevision
   966  
   967  		// If PUT was successful, GET the same
   968  		kve, err := kv.Get(key)
   969  		if err == nats.ErrKeyNotFound {
    970  			t.Fatalf("GET key not found, but the key does exist (last PUT revision: %d)", putRevision)
   971  		} else if err != nil {
   972  			t.Logf("GET error: %v", err)
   973  			continue putGetLoop
   974  		}
   975  
   976  		getValue := string(kve.Value())
   977  		getRevision := kve.Revision()
   978  
   979  		if putRevision > getRevision {
   980  			// Stale read, violates 'read committed' consistency criteria
   981  			if !staleReadsOk {
   982  				t.Fatalf("PUT value %s (rev: %d) then read value %s (rev: %d)", putValue, putRevision, getValue, getRevision)
   983  			} else {
   984  				staleReadsCount += 1
   985  			}
   986  		} else if putRevision < getRevision {
   987  			// Returned revision is higher than any ever written, this should never happen
   988  			t.Fatalf("GET returned revision %d, but most recent expected revision is %d", getRevision, putRevision)
   989  		} else if putValue != getValue {
   990  			// Returned revision matches latest, but values do not, this should never happen
   991  			t.Fatalf("GET returned revision %d with value %s, but value %s was just committed for that same revision", getRevision, getValue, putValue)
   992  		} else {
   993  			// Get returned the latest revision/value
   994  			successCount += 1
   995  			if chaosKvTestsDebug {
   996  				t.Logf("PUT+GET %s=%s (rev: %d)", key, putValue, putRevision)
   997  			}
   998  		}
   999  	}
  1000  
   1001  	t.Logf("Completed %d PUT+GET cycles, of which %d were successful; %d GETs returned a stale value", numOps, successCount, staleReadsCount)
  1002  }
  1003  
   1004  // A variant of TestJetStreamChaosKvPutGet where PUT is retried until successful, and GET is retried until it returns the latest known key revision.
   1005  // This validates that a confirmed PUT value is never lost and eventually becomes visible.
  1006  func TestJetStreamChaosKvPutGetWithRetries(t *testing.T) {
  1007  
  1008  	const numOps = 10_000
  1009  	const maxRetries = 20
  1010  	const retryDelay = 100 * time.Millisecond
  1011  	const clusterSize = 3
  1012  	const replicas = 3
  1013  	const key = "key"
  1014  
  1015  	c := createJetStreamClusterExplicit(t, chaosKvTestsClusterName, clusterSize)
  1016  	defer c.shutdown()
  1017  
  1018  	createBucketForKvChaosTest(t, c, replicas)
  1019  
  1020  	chaos := createClusterChaosMonkeyController(
  1021  		t,
  1022  		c,
  1023  		&clusterBouncerChaosMonkey{
  1024  			minDowntime:    0 * time.Second,
  1025  			maxDowntime:    2 * time.Second,
  1026  			minDownServers: clusterSize, // Whole cluster outage
  1027  			maxDownServers: clusterSize,
  1028  			pause:          1 * time.Second,
  1029  		},
  1030  	)
  1031  
  1032  	nc, js := jsClientConnectCluster(t, c)
  1033  	defer nc.Close()
  1034  
  1035  	kv, err := js.KeyValue(chaosKvTestsBucketName)
  1036  	if err != nil {
  1037  		t.Fatalf("Failed to get KV store: %v", err)
  1038  	}
  1039  
  1040  	// Initialize key value
  1041  	firstRevision, err := kv.Create(key, []byte("INITIAL VALUE"))
  1042  	if err != nil {
  1043  		t.Fatalf("Failed to create key: %v", err)
  1044  	} else if firstRevision != 1 {
  1045  		t.Fatalf("Unexpected revision: %d", firstRevision)
  1046  	}
  1047  
  1048  	// Start chaos
  1049  	chaos.start()
  1050  	defer chaos.stop()
  1051  
  1052  	staleReadCount := 0
  1053  	previousRevision := firstRevision
  1054  
  1055  putGetLoop:
  1056  	for i := 1; i <= numOps; i++ {
  1057  
  1058  		if i%1000 == 0 {
  1059  			t.Logf("Completed %d/%d PUT+GET operations", i, numOps)
  1060  		}
  1061  
  1062  		putValue := fmt.Sprintf("value-%d", i)
  1063  		putRevision := uint64(0)
  1064  
  1065  		// Put new value for key, retry until successful or out of retries
  1066  	putRetryLoop:
  1067  		for r := 0; r <= maxRetries; r++ {
  1068  			var putErr error
  1069  			putRevision, putErr = kv.Put(key, []byte(putValue))
  1070  			if putErr == nil {
  1071  				break putRetryLoop
  1072  			} else if r == maxRetries {
  1073  				t.Fatalf("Failed to PUT (retried %d times): %v", maxRetries, putErr)
  1074  			} else {
  1075  				if chaosKvTestsDebug {
  1076  					t.Logf("PUT error: %v", putErr)
  1077  				}
  1078  				time.Sleep(retryDelay)
  1079  			}
  1080  		}
  1081  
  1082  		// Ensure key version is monotonically increasing
  1083  		if putRevision <= previousRevision {
  1084  			t.Fatalf("Latest PUT created revision %d which is not greater than the previous revision: %d", putRevision, previousRevision)
  1085  		}
  1086  		previousRevision = putRevision
  1087  
  1088  		// Read value for key, retry until successful, and validate corresponding version and value
  1089  	getRetryLoop:
  1090  		for r := 0; r <= maxRetries; r++ {
  1091  			var getErr error
  1092  			kve, getErr := kv.Get(key)
  1093  			if getErr != nil && r == maxRetries {
  1094  				t.Fatalf("Failed to GET (retried %d times): %v", maxRetries, getErr)
  1095  			} else if getErr != nil {
  1096  				if chaosKvTestsDebug {
  1097  					t.Logf("GET error: %v", getErr)
  1098  				}
  1099  				time.Sleep(retryDelay)
  1100  				continue getRetryLoop
  1101  			}
  1102  
  1103  			// GET successful, check revision and value
  1104  			getValue := string(kve.Value())
  1105  			getRevision := kve.Revision()
  1106  
  1107  			if putRevision == getRevision {
  1108  				if putValue != getValue {
  1109  					t.Fatalf("Unexpected value %s for revision %d, expected: %s", getValue, getRevision, putValue)
  1110  				}
  1111  				if chaosKvTestsDebug {
  1112  					t.Logf("PUT+GET %s=%s (rev: %d) (retry: %d)", key, putValue, putRevision, r)
  1113  				}
  1114  				continue putGetLoop
  1115  			} else if getRevision > putRevision {
  1116  				t.Fatalf("GET returned version that should not exist yet: %d, last created: %d", getRevision, putRevision)
  1117  			} else { // get revision < put revision
  1118  				staleReadCount += 1
  1119  				if chaosKvTestsDebug {
  1120  					t.Logf("GET got stale value: %v (rev: %d, latest: %d)", getValue, getRevision, putRevision)
  1121  				}
  1122  				time.Sleep(retryDelay)
  1123  				continue getRetryLoop
  1124  			}
  1125  		}
  1126  	}
  1127  
   1128  	t.Logf("Client completed %d PUT+GET cycles, %d GETs returned a stale value", numOps, staleReadCount)
  1129  }
  1130  
  1131  // Multiple clients updating a finite set of keys with CAS semantics.
  1132  // TODO check that revision is never lower than last one seen
  1133  // TODO check that KeyNotFound is never returned, as keys are initialized beforehand
  1134  func TestJetStreamChaosKvCAS(t *testing.T) {
  1135  	const numOps = 10_000
  1136  	const maxRetries = 50
  1137  	const retryDelay = 300 * time.Millisecond
  1138  	const clusterSize = 3
  1139  	const replicas = 3
  1140  	const numKeys = 15
  1141  	const numClients = 5
  1142  
  1143  	c := createJetStreamClusterExplicit(t, chaosKvTestsClusterName, clusterSize)
  1144  	defer c.shutdown()
  1145  
  1146  	createBucketForKvChaosTest(t, c, replicas)
  1147  
  1148  	chaos := createClusterChaosMonkeyController(
  1149  		t,
  1150  		c,
  1151  		&clusterBouncerChaosMonkey{
  1152  			minDowntime:    0 * time.Second,
  1153  			maxDowntime:    2 * time.Second,
  1154  			minDownServers: clusterSize, // Whole cluster outage
  1155  			maxDownServers: clusterSize,
  1156  			pause:          1 * time.Second,
  1157  		},
  1158  	)
  1159  
  1160  	nc, js := jsClientConnectCluster(t, c)
  1161  	defer nc.Close()
  1162  
  1163  	// Create bucket
  1164  	kv, err := js.KeyValue(chaosKvTestsBucketName)
  1165  	if err != nil {
  1166  		t.Fatalf("Failed to get KV store: %v", err)
  1167  	}
  1168  
  1169  	// Create set of keys and initialize them with dummy value
  1170  	keys := make([]string, numKeys)
  1171  	for k := 0; k < numKeys; k++ {
  1172  		key := fmt.Sprintf("key-%d", k)
  1173  		keys[k] = key
  1174  
  1175  		_, err := kv.Create(key, []byte("Initial value"))
  1176  		if err != nil {
  1177  			t.Fatalf("Failed to create key: %v", err)
  1178  		}
  1179  	}
  1180  
  1181  	wgStart := sync.WaitGroup{}
  1182  	wgComplete := sync.WaitGroup{}
  1183  
  1184  	// Client routine
  1185  	client := func(clientId int, kv nats.KeyValue) {
  1186  		defer wgComplete.Done()
  1187  
  1188  		rng := rand.New(rand.NewSource(int64(clientId)))
  1189  		successfulUpdates := 0
  1190  		casRejectUpdates := 0
  1191  		otherUpdateErrors := 0
  1192  
  1193  		// Map to track last known revision for each of the keys
  1194  		knownRevisions := map[string]uint64{}
  1195  		for _, key := range keys {
  1196  			knownRevisions[key] = 0
  1197  		}
  1198  
  1199  		// Wait for all clients to reach this point before proceeding
  1200  		wgStart.Done()
  1201  		wgStart.Wait()
  1202  
  1203  		for i := 1; i <= numOps; i++ {
  1204  
  1205  			if i%1000 == 0 {
  1206  				t.Logf("Client %d completed %d/%d updates", clientId, i, numOps)
  1207  			}
  1208  
  1209  			// Pick random key from the set
  1210  			key := keys[rng.Intn(numKeys)]
  1211  
  1212  			// Prepare unique value to be written
  1213  			value := fmt.Sprintf("client: %d operation %d", clientId, i)
  1214  
  1215  			// Try to update a key with CAS
  1216  			newRevision, updateErr := kv.Update(key, []byte(value), knownRevisions[key])
  1217  			if updateErr == nil {
  1218  				// Update successful
  1219  				knownRevisions[key] = newRevision
  1220  				successfulUpdates += 1
  1221  				if chaosKvTestsDebug {
  1222  					t.Logf("Client %d updated key %s, new revision: %d", clientId, key, newRevision)
  1223  				}
  1224  			} else if updateErr != nil && strings.Contains(fmt.Sprint(updateErr), "wrong last sequence") {
  1225  				// CAS rejected update, learn current revision for this key
  1226  				casRejectUpdates += 1
  1227  
  1228  				for r := 0; r <= maxRetries; r++ {
  1229  					kve, getErr := kv.Get(key)
  1230  					if getErr == nil {
  1231  						currentRevision := kve.Revision()
  1232  						if currentRevision < knownRevisions[key] {
  1233  							// Revision number moved backward, this should never happen
  1234  							t.Fatalf("Current revision for key %s is %d, which is lower than the last known revision %d", key, currentRevision, knownRevisions[key])
  1235  
  1236  						}
  1237  
  1238  						knownRevisions[key] = currentRevision
  1239  						if chaosKvTestsDebug {
   1240  							t.Logf("Client %d learned key %s revision: %d", clientId, key, currentRevision)
  1241  						}
  1242  						break
  1243  					} else if r == maxRetries {
  1244  						t.Fatalf("Failed to GET (retried %d times): %v", maxRetries, getErr)
  1245  					} else {
  1246  						time.Sleep(retryDelay)
  1247  					}
  1248  				}
  1249  			} else {
  1250  				// Other update error
  1251  				otherUpdateErrors += 1
  1252  				if chaosKvTestsDebug {
  1253  					t.Logf("Client %d update error for key %s: %v", clientId, key, updateErr)
  1254  				}
  1255  				time.Sleep(retryDelay)
  1256  			}
  1257  		}
  1258  		t.Logf("Client %d done, %d kv updates, %d CAS rejected, %d other errors", clientId, successfulUpdates, casRejectUpdates, otherUpdateErrors)
  1259  	}
  1260  
  1261  	// Launch all clients
  1262  	for i := 1; i <= numClients; i++ {
  1263  		cNc, cJs := jsClientConnectCluster(t, c)
  1264  		defer cNc.Close()
  1265  
  1266  		cKv, err := cJs.KeyValue(chaosKvTestsBucketName)
  1267  		if err != nil {
  1268  			t.Fatalf("Failed to get KV store: %v", err)
  1269  		}
  1270  
  1271  		wgStart.Add(1)
  1272  		wgComplete.Add(1)
  1273  		go client(i, cKv)
  1274  	}
  1275  
  1276  	// Wait for clients to be connected and ready
  1277  	wgStart.Wait()
  1278  
  1279  	// Start failures
  1280  	chaos.start()
  1281  	defer chaos.stop()
  1282  
  1283  	// Wait for all clients to be done
  1284  	wgComplete.Wait()
  1285  }