// Copyright 2023 The NATS Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build js_chaos_tests
// +build js_chaos_tests

package server

import (
	"bytes"
	"encoding/json"
	"fmt"
	"math/rand"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/nats-io/nats.go"
)

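// The tests in this file are excluded from regular builds by the js_chaos_tests
// build tag above. A typical invocation (the package path and chosen flags are
// assumptions, adjust to your setup) might look like:
//
//	go test -tags=js_chaos_tests -run TestJetStreamChaos -v ./server
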
// Support functions for "chaos" testing (random injected failures)

type ChaosMonkeyController interface {
	// Launch the monkey as a background routine and return
	start()
	// Stop a monkey that was previously started
	stop()
	// Run the monkey synchronously, until it is manually stopped via stopCh
	run()
}

type ClusterChaosMonkey interface {
	// Sets defaults and validates the monkey parameters
	validate(t *testing.T, c *cluster)
	// Run the monkey synchronously, until it is manually stopped via stopCh
	run(t *testing.T, c *cluster, stopCh <-chan bool)
}

// Chaos Monkey Controller that acts on a cluster
type clusterChaosMonkeyController struct {
	t       *testing.T
	cluster *cluster
	wg      sync.WaitGroup
	stopCh  chan bool
	ccm     ClusterChaosMonkey
}

func createClusterChaosMonkeyController(t *testing.T, c *cluster, ccm ClusterChaosMonkey) ChaosMonkeyController {
	ccm.validate(t, c)
	return &clusterChaosMonkeyController{
		t:       t,
		cluster: c,
		stopCh:  make(chan bool, 3),
		ccm:     ccm,
	}
}

func (m *clusterChaosMonkeyController) start() {
	m.t.Logf("🐵 Starting monkey")
	m.wg.Add(1)
	go func() {
		defer m.wg.Done()
		m.run()
	}()
}

func (m *clusterChaosMonkeyController) stop() {
	m.t.Logf("🐵 Stopping monkey")
	m.stopCh <- true
	m.wg.Wait()
	m.t.Logf("🐵 Monkey stopped")
}

func (m *clusterChaosMonkeyController) run() {
	m.ccm.run(m.t, m.cluster, m.stopCh)
}

// Cluster Chaos Monkey that selects a random subset of the nodes in a cluster (according to the min/max provided),
// shuts them down for a given duration (according to the min/max provided), then brings them back up.
// Then sleeps for a given time, and does it again until stopped.
type clusterBouncerChaosMonkey struct {
	minDowntime    time.Duration
	maxDowntime    time.Duration
	minDownServers int
	maxDownServers int
	pause          time.Duration
}

func (m *clusterBouncerChaosMonkey) validate(t *testing.T, c *cluster) {
	if m.minDowntime > m.maxDowntime {
		t.Fatalf("Min downtime %v cannot be larger than max downtime %v", m.minDowntime, m.maxDowntime)
	}

	if m.minDownServers > m.maxDownServers {
		t.Fatalf("Min down servers %v cannot be larger than max down servers %v", m.minDownServers, m.maxDownServers)
	}
}

func (m *clusterBouncerChaosMonkey) run(t *testing.T, c *cluster, stopCh <-chan bool) {
	for {
		// Pause between actions
		select {
		case <-stopCh:
			return
		case <-time.After(m.pause):
		}

		// Pick a random subset of servers
		numServersDown := rand.Intn(1+m.maxDownServers-m.minDownServers) + m.minDownServers
		servers := c.selectRandomServers(numServersDown)
		serverNames := []string{}
		for _, s := range servers {
			serverNames = append(serverNames, s.info.Name)
		}

		// Pick a random outage interval
		minOutageNanos := m.minDowntime.Nanoseconds()
		maxOutageNanos := m.maxDowntime.Nanoseconds()
		outageDurationNanos := rand.Int63n(1+maxOutageNanos-minOutageNanos) + minOutageNanos
		outageDuration := time.Duration(outageDurationNanos)

		// Take down the selected servers
		t.Logf("🐵 Taking down %d/%d servers for %v (%v)", numServersDown, len(c.servers), outageDuration, serverNames)
		c.stopSubset(servers)

		// Wait for the "outage" duration
		select {
		case <-stopCh:
			return
		case <-time.After(outageDuration):
		}

		// Restart servers and wait for the cluster to be healthy
		t.Logf("🐵 Restoring cluster")
		c.restartAllSamePorts()
		c.waitOnClusterHealthz()

		c.waitOnClusterReady()
		c.waitOnAllCurrent()
		c.waitOnLeader()
	}
}

// Additional cluster methods for chaos testing

func (c *cluster) waitOnClusterHealthz() {
	c.t.Helper()
	for _, cs := range c.servers {
		c.waitOnServerHealthz(cs)
	}
}

func (c *cluster) stopSubset(toStop []*Server) {
	c.t.Helper()
	for _, s := range toStop {
		s.Shutdown()
	}
}

func (c *cluster) selectRandomServers(numServers int) []*Server {
	c.t.Helper()
	if numServers > len(c.servers) {
		panic(fmt.Sprintf("Can't select %d servers in a cluster of %d", numServers, len(c.servers)))
	}
	var selectedServers []*Server
	selectedServers = append(selectedServers, c.servers...)
	rand.Shuffle(len(selectedServers), func(x, y int) {
		selectedServers[x], selectedServers[y] = selectedServers[y], selectedServers[x]
	})
	return selectedServers[0:numServers]
}

// Other helpers

func jsClientConnectCluster(t testing.TB, c *cluster) (*nats.Conn, nats.JetStreamContext) {
	serverConnectURLs := make([]string, len(c.servers))
	for i, server := range c.servers {
		serverConnectURLs[i] = server.ClientURL()
	}
	connectURL := strings.Join(serverConnectURLs, ",")

	nc, err := nats.Connect(connectURL)
	if err != nil {
		t.Fatalf("Failed to connect: %s", err)
	}

	js, err := nc.JetStream()
	if err != nil {
		t.Fatalf("Failed to init JetStream context: %s", err)
	}

	return nc, js
}

func toIndentedJsonString(v any) string {
	s, err := json.MarshalIndent(v, "", " ")
	if err != nil {
		panic(err)
	}
	return string(s)
}

// Bounces the entire set of nodes, then brings them back up.
// Fails if some nodes don't come back online.
func TestJetStreamChaosClusterBounce(t *testing.T) {

	const duration = 60 * time.Second
	const clusterSize = 3

	c := createJetStreamClusterExplicit(t, "R3", clusterSize)
	defer c.shutdown()

	chaos := createClusterChaosMonkeyController(
		t,
		c,
		&clusterBouncerChaosMonkey{
			minDowntime:    0 * time.Second,
			maxDowntime:    2 * time.Second,
			minDownServers: clusterSize,
			maxDownServers: clusterSize,
			pause:          3 * time.Second,
		},
	)
	chaos.start()
	defer chaos.stop()

	<-time.After(duration)
}

// Bounces a subset of the nodes, then brings them back up.
// Fails if some nodes don't come back online.
func TestJetStreamChaosClusterBounceSubset(t *testing.T) {

	const duration = 60 * time.Second
	const clusterSize = 3

	c := createJetStreamClusterExplicit(t, "R3", clusterSize)
	defer c.shutdown()

	chaos := createClusterChaosMonkeyController(
		t,
		c,
		&clusterBouncerChaosMonkey{
			minDowntime:    0 * time.Second,
			maxDowntime:    2 * time.Second,
			minDownServers: 1,
			maxDownServers: clusterSize,
			pause:          3 * time.Second,
		},
	)
	chaos.start()
	defer chaos.stop()

	<-time.After(duration)
}

const (
	chaosConsumerTestsClusterName = "CONSUMERS_CHAOS_TEST"
	chaosConsumerTestsStreamName  = "CONSUMER_CHAOS_TEST_STREAM"
	chaosConsumerTestsSubject     = "foo"
	chaosConsumerTestsDebug       = false
)

// Creates a stream and fills it with the given number of messages.
// Each message is the string representation of the stream sequence number,
// e.g. the first message (seqno: 1) contains data "1".
// This allows consumers to verify the content of each message without tracking additional state.
func createStreamForConsumerChaosTest(t *testing.T, c *cluster, replicas, numMessages int) {
	t.Helper()

	const publishBatchSize = 1_000

	pubNc, pubJs := jsClientConnectCluster(t, c)
	defer pubNc.Close()

	_, err := pubJs.AddStream(&nats.StreamConfig{
		Name:     chaosConsumerTestsStreamName,
		Subjects: []string{chaosConsumerTestsSubject},
		Replicas: replicas,
	})
	if err != nil {
		t.Fatalf("Error creating stream: %v", err)
	}

	ackFutures := make([]nats.PubAckFuture, 0, publishBatchSize)

	for i := 1; i <= numMessages; i++ {
		message := []byte(fmt.Sprintf("%d", i))
		pubAckFuture, err := pubJs.PublishAsync(chaosConsumerTestsSubject, message, nats.ExpectLastSequence(uint64(i-1)))
		if err != nil {
			t.Fatalf("Publish error: %s", err)
		}
		ackFutures = append(ackFutures, pubAckFuture)

		if (i > 0 && i%publishBatchSize == 0) || i == numMessages {
			select {
			case <-pubJs.PublishAsyncComplete():
				for _, pubAckFuture := range ackFutures {
					select {
					case <-pubAckFuture.Ok():
						// Noop
					case pubAckErr := <-pubAckFuture.Err():
						t.Fatalf("Error publishing: %s", pubAckErr)
					case <-time.After(30 * time.Second):
						t.Fatalf("Timeout verifying pubAck for message: %s", pubAckFuture.Msg().Data)
					}
				}
				ackFutures = make([]nats.PubAckFuture, 0, publishBatchSize)
				t.Logf("Published %d/%d messages", i, numMessages)

			case <-time.After(30 * time.Second):
				t.Fatalf("Publish timed out")
			}
		}
	}
}

// Verify ordered delivery despite cluster-wide outages
func TestJetStreamChaosConsumerOrdered(t *testing.T) {

	const numMessages = 5_000
	const numBatch = 500
	const maxRetries = 100
	const retryDelay = 500 * time.Millisecond
	const fetchTimeout = 250 * time.Millisecond
	const clusterSize = 3
	const replicas = 3

	c := createJetStreamClusterExplicit(t, chaosConsumerTestsClusterName, clusterSize)
	defer c.shutdown()

	createStreamForConsumerChaosTest(t, c, replicas, numMessages)

	chaos := createClusterChaosMonkeyController(
		t,
		c,
		&clusterBouncerChaosMonkey{
			minDowntime:    0 * time.Second,
			maxDowntime:    2 * time.Second,
			minDownServers: clusterSize, // Whole cluster outage
			maxDownServers: clusterSize,
			pause:          1 * time.Second,
		},
	)

	subNc, subJs := jsClientConnectCluster(t, c)
	defer subNc.Close()

	sub, err := subJs.SubscribeSync(
		chaosConsumerTestsSubject,
		nats.OrderedConsumer(),
	)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	defer sub.Unsubscribe()

	if chaosConsumerTestsDebug {
		t.Logf("Initial subscription: %s", toIndentedJsonString(sub))
	}

	chaos.start()
	defer chaos.stop()

	for i := 1; i <= numMessages; i++ {
		var msg *nats.Msg
		var nextMsgErr error
		var expectedMsgData = []byte(fmt.Sprintf("%d", i))

	nextMsgRetryLoop:
		for r := 0; r <= maxRetries; r++ {
			msg, nextMsgErr = sub.NextMsg(fetchTimeout)
			if nextMsgErr == nil {
				break nextMsgRetryLoop
			} else if r == maxRetries {
				t.Fatalf("Exceeded max retries for NextMsg")
			} else if nextMsgErr == nats.ErrBadSubscription {
				t.Fatalf("Subscription is invalid: %s", toIndentedJsonString(sub))
			} else {
				time.Sleep(retryDelay)
			}
		}

		metadata, err := msg.Metadata()
		if err != nil {
t.Fatalf("Failed to get message metadata: %v", err) 398 } 399 400 if metadata.Sequence.Stream != uint64(i) { 401 t.Fatalf("Expecting stream sequence %d, got %d instead", i, metadata.Sequence.Stream) 402 } 403 404 if !bytes.Equal(msg.Data, expectedMsgData) { 405 t.Fatalf("Expecting message %s, got %s instead", expectedMsgData, msg.Data) 406 } 407 408 // Simulate application processing (and gives the monkey some time to brew chaos) 409 time.Sleep(10 * time.Millisecond) 410 411 if i%numBatch == 0 { 412 t.Logf("Consumed %d/%d", i, numMessages) 413 } 414 } 415 } 416 417 // Verify ordered delivery despite cluster-wide outages 418 func TestJetStreamChaosConsumerAsync(t *testing.T) { 419 420 const numMessages = 5_000 421 const numBatch = 500 422 const timeout = 30 * time.Second // No (new) messages for 30s => terminate 423 const maxRetries = 25 424 const retryDelay = 500 * time.Millisecond 425 const clusterSize = 3 426 const replicas = 3 427 428 c := createJetStreamClusterExplicit(t, chaosConsumerTestsClusterName, clusterSize) 429 defer c.shutdown() 430 431 createStreamForConsumerChaosTest(t, c, replicas, numMessages) 432 433 chaos := createClusterChaosMonkeyController( 434 t, 435 c, 436 &clusterBouncerChaosMonkey{ 437 minDowntime: 0 * time.Second, 438 maxDowntime: 2 * time.Second, 439 minDownServers: clusterSize, 440 maxDownServers: clusterSize, 441 pause: 2 * time.Second, 442 }, 443 ) 444 445 subNc, subJs := jsClientConnectCluster(t, c) 446 defer subNc.Close() 447 448 timeoutTimer := time.NewTimer(timeout) 449 deliveryCount := uint64(0) 450 received := NewBitset(numMessages) 451 452 handleMsg := func(msg *nats.Msg) { 453 deliveryCount += 1 454 455 metadata, err := msg.Metadata() 456 if err != nil { 457 t.Fatalf("Failed to get message metadata: %v", err) 458 } 459 seq := metadata.Sequence.Stream 460 461 var expectedMsgData = []byte(fmt.Sprintf("%d", seq)) 462 if !bytes.Equal(msg.Data, expectedMsgData) { 463 t.Fatalf("Expecting message content '%s', got '%s' instead", expectedMsgData, msg.Data) 464 } 465 466 isDupe := received.get(seq - 1) 467 468 if isDupe { 469 if chaosConsumerTestsDebug { 470 t.Logf("Duplicate message delivery, seq: %d", seq) 471 } 472 return 473 } 474 475 // Mark this sequence as received 476 received.set(seq-1, true) 477 if received.count() < numMessages { 478 // Reset timeout 479 timeoutTimer.Reset(timeout) 480 } else { 481 // All received, speed up the shutdown 482 timeoutTimer.Reset(1 * time.Second) 483 } 484 485 if received.count()%numBatch == 0 { 486 t.Logf("Consumed %d/%d", received.count(), numMessages) 487 } 488 489 // Simulate application processing (and gives the monkey some time to brew chaos) 490 time.Sleep(10 * time.Millisecond) 491 492 ackRetryLoop: 493 for i := 0; i <= maxRetries; i++ { 494 ackErr := msg.Ack() 495 if ackErr == nil { 496 break ackRetryLoop 497 } else if i == maxRetries { 498 t.Fatalf("Failed to ACK message %d (retried %d times)", seq, maxRetries) 499 } else { 500 time.Sleep(retryDelay) 501 } 502 } 503 } 504 505 subOpts := []nats.SubOpt{} 506 sub, err := subJs.Subscribe(chaosConsumerTestsSubject, handleMsg, subOpts...) 507 if err != nil { 508 t.Fatalf("Unexpected error: %v", err) 509 } 510 defer sub.Unsubscribe() 511 512 chaos.start() 513 defer chaos.stop() 514 515 // Wait for long enough silence. 
	// either a stall, or all messages received
	<-timeoutTimer.C

	// Shut down the consumer
	sub.Unsubscribe()

	uniqueDeliveredCount := received.count()

	t.Logf(
		"Delivered %d/%d messages, %d duplicate deliveries",
		uniqueDeliveredCount,
		numMessages,
		deliveryCount-uniqueDeliveredCount,
	)

	if uniqueDeliveredCount != numMessages {
		t.Fatalf("No new message delivered in the last %s, %d/%d messages never delivered", timeout, numMessages-uniqueDeliveredCount, numMessages)
	}
}

// Verify a durable consumer retains its state despite cluster-wide outages.
// The consumer connection is also periodically closed, and the consumer 'resumes' on a different one.
func TestJetStreamChaosConsumerDurable(t *testing.T) {

	const numMessages = 5_000
	const numBatch = 500
	const timeout = 30 * time.Second // No (new) messages for 30s => terminate
	const clusterSize = 3
	const replicas = 3
	const maxRetries = 25
	const retryDelay = 500 * time.Millisecond
	const durableConsumerName = "durable"

	c := createJetStreamClusterExplicit(t, chaosConsumerTestsClusterName, clusterSize)
	defer c.shutdown()

	createStreamForConsumerChaosTest(t, c, replicas, numMessages)

	chaos := createClusterChaosMonkeyController(
		t,
		c,
		&clusterBouncerChaosMonkey{
			minDowntime:    0 * time.Second,
			maxDowntime:    2 * time.Second,
			minDownServers: 1,
			maxDownServers: clusterSize,
			pause:          3 * time.Second,
		},
	)

	var nc *nats.Conn
	var sub *nats.Subscription
	var subLock sync.Mutex

	var handleMsgFun func(msg *nats.Msg)
	var natsURL string

	{
		var sb strings.Builder
		for _, s := range c.servers {
			sb.WriteString(s.ClientURL())
			sb.WriteString(",")
		}
		natsURL = sb.String()
	}

	resetDurableConsumer := func() {
		subLock.Lock()
		defer subLock.Unlock()

		if nc != nil {
			nc.Close()
		}

		var newNc *nats.Conn
	connectRetryLoop:
		for r := 0; r <= maxRetries; r++ {
			var connErr error
			newNc, connErr = nats.Connect(natsURL)
			if connErr == nil {
				break connectRetryLoop
			} else if r == maxRetries {
				t.Fatalf("Failed to connect, exceeded max retries, last error: %s", connErr)
			} else {
				time.Sleep(retryDelay)
			}
		}

		var newJs nats.JetStreamContext
	jsRetryLoop:
		for r := 0; r <= maxRetries; r++ {
			var jsErr error
			newJs, jsErr = newNc.JetStream(nats.MaxWait(10 * time.Second))
			if jsErr == nil {
				break jsRetryLoop
			} else if r == maxRetries {
				t.Fatalf("Failed to get JS, exceeded max retries, last error: %s", jsErr)
			} else {
				time.Sleep(retryDelay)
			}
		}

		subOpts := []nats.SubOpt{
			nats.Durable(durableConsumerName),
		}

		var newSub *nats.Subscription
	subscribeRetryLoop:
		for i := 0; i <= maxRetries; i++ {
			var subErr error
			newSub, subErr = newJs.Subscribe(chaosConsumerTestsSubject, handleMsgFun, subOpts...)
			if subErr == nil {
				ci, err := newJs.ConsumerInfo(chaosConsumerTestsStreamName, durableConsumerName)
				if err == nil {
					if chaosConsumerTestsDebug {
						t.Logf("Consumer info:\n %s", toIndentedJsonString(ci))
					}
				} else {
					t.Logf("Failed to retrieve consumer info: %s", err)
				}

				break subscribeRetryLoop
			} else if i == maxRetries {
				t.Fatalf("Exceeded max retries creating subscription: %v", subErr)
			} else {
				time.Sleep(retryDelay)
			}
		}

		nc, sub = newNc, newSub
	}

	timeoutTimer := time.NewTimer(timeout)
	deliveryCount := uint64(0)
	received := NewBitset(numMessages)

	handleMsgFun = func(msg *nats.Msg) {

		subLock.Lock()
		if msg.Sub != sub {
			// Message from a previous instance of the durable consumer, drop it
			defer subLock.Unlock()
			return
		}
		subLock.Unlock()

		deliveryCount += 1

		metadata, err := msg.Metadata()
		if err != nil {
			t.Fatalf("Failed to get message metadata: %v", err)
		}
		seq := metadata.Sequence.Stream

		var expectedMsgData = []byte(fmt.Sprintf("%d", seq))
		if !bytes.Equal(msg.Data, expectedMsgData) {
			t.Fatalf("Expecting message content '%s', got '%s' instead", expectedMsgData, msg.Data)
		}

		isDupe := received.get(seq - 1)

		if isDupe {
			if chaosConsumerTestsDebug {
				t.Logf("Duplicate message delivery, seq: %d", seq)
			}
			return
		}

		// Mark this sequence as received
		received.set(seq-1, true)
		if received.count() < numMessages {
			// Reset the timeout
			timeoutTimer.Reset(timeout)
		} else {
			// All received, speed up the shutdown
			timeoutTimer.Reset(1 * time.Second)
		}

		// Simulate application processing (and give the monkey some time to brew chaos)
		time.Sleep(10 * time.Millisecond)

	ackRetryLoop:
		for i := 0; i <= maxRetries; i++ {
			ackErr := msg.Ack()
			if ackErr == nil {
				break ackRetryLoop
			} else if i == maxRetries {
				t.Fatalf("Failed to ACK message %d (retried %d times)", seq, maxRetries)
			} else {
				time.Sleep(retryDelay)
			}
		}

		if received.count()%numBatch == 0 {
			t.Logf("Consumed %d/%d, duplicate deliveries: %d", received.count(), numMessages, deliveryCount-received.count())
			// Close the connection and resume consuming on a different one
			resetDurableConsumer()
		}
	}

	resetDurableConsumer()

	chaos.start()
	defer chaos.stop()

	// Wait for a long enough silence:
	// either a stall, or all messages received
	<-timeoutTimer.C

	// Shut down the consumer
	if sub != nil {
		sub.Unsubscribe()
	}

	uniqueDeliveredCount := received.count()

	t.Logf(
		"Delivered %d/%d messages, %d duplicate deliveries",
		uniqueDeliveredCount,
		numMessages,
		deliveryCount-uniqueDeliveredCount,
	)

	if uniqueDeliveredCount != numMessages {
		t.Fatalf("No new message delivered in the last %s, %d/%d messages never delivered", timeout, numMessages-uniqueDeliveredCount, numMessages)
	}
}

// Verify a durable pull consumer delivers all messages despite cluster-wide outages
func TestJetStreamChaosConsumerPull(t *testing.T) {

	const numMessages = 5_000
	const numBatch = 500
	const maxRetries = 100
	const retryDelay = 500 * time.Millisecond
	const fetchTimeout = 250 * time.Millisecond
	const fetchBatchSize = 100
	const clusterSize = 3
	const replicas = 3
	const durableConsumerName = "durable"

	c := createJetStreamClusterExplicit(t, chaosConsumerTestsClusterName, clusterSize)
	defer c.shutdown()

	createStreamForConsumerChaosTest(t, c, replicas, numMessages)

	chaos := createClusterChaosMonkeyController(
		t,
		c,
		&clusterBouncerChaosMonkey{
			minDowntime:    0 * time.Second,
			maxDowntime:    2 * time.Second,
			minDownServers: clusterSize, // Whole cluster outage
			maxDownServers: clusterSize,
			pause:          1 * time.Second,
		},
	)

	subNc, subJs := jsClientConnectCluster(t, c)
	defer subNc.Close()

	sub, err := subJs.PullSubscribe(
		chaosConsumerTestsSubject,
		durableConsumerName,
	)
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}
	defer sub.Unsubscribe()

	if chaosConsumerTestsDebug {
		t.Logf("Initial subscription: %s", toIndentedJsonString(sub))
	}

	chaos.start()
	defer chaos.stop()

	fetchMaxWait := nats.MaxWait(fetchTimeout)
	received := NewBitset(numMessages)
	deliveredCount := uint64(0)

	for received.count() < numMessages {

		var msgs []*nats.Msg
		var fetchErr error

	fetchRetryLoop:
		for r := 0; r <= maxRetries; r++ {
			msgs, fetchErr = sub.Fetch(fetchBatchSize, fetchMaxWait)
			if fetchErr == nil {
				break fetchRetryLoop
			} else if r == maxRetries {
				t.Fatalf("Exceeded max retries for Fetch, last error: %s", fetchErr)
			} else if fetchErr == nats.ErrBadSubscription {
				t.Fatalf("Subscription is invalid: %s", toIndentedJsonString(sub))
			} else {
				// t.Logf("Fetch error: %v", fetchErr)
				time.Sleep(retryDelay)
			}
		}

		for _, msg := range msgs {

			deliveredCount += 1

			metadata, err := msg.Metadata()
			if err != nil {
				t.Fatalf("Failed to get message metadata: %v", err)
			}

			streamSeq := metadata.Sequence.Stream

			expectedMsgData := []byte(fmt.Sprintf("%d", streamSeq))

			if !bytes.Equal(msg.Data, expectedMsgData) {
				t.Fatalf("Expecting message %s, got %s instead", expectedMsgData, msg.Data)
			}

			isDupe := received.get(streamSeq - 1)

			received.set(streamSeq-1, true)

			// Simulate application processing (and give the monkey some time to brew chaos)
			time.Sleep(10 * time.Millisecond)

		ackRetryLoop:
			for r := 0; r <= maxRetries; r++ {
				ackErr := msg.Ack()
				if ackErr == nil {
					break ackRetryLoop
				} else if r == maxRetries {
					t.Fatalf("Failed to ACK message %d, last error: %s", streamSeq, ackErr)
				} else {
					time.Sleep(retryDelay)
				}
			}

			if !isDupe && received.count()%numBatch == 0 {
t.Logf("Consumed %d/%d (duplicates: %d)", received.count(), numMessages, deliveredCount-received.count()) 854 } 855 } 856 } 857 } 858 859 const ( 860 chaosKvTestsClusterName = "KV_CHAOS_TEST" 861 chaosKvTestsBucketName = "KV_CHAOS_TEST_BUCKET" 862 chaosKvTestsSubject = "foo" 863 chaosKvTestsDebug = false 864 ) 865 866 // Creates KV store (a.k.a. bucket). 867 func createBucketForKvChaosTest(t *testing.T, c *cluster, replicas int) { 868 t.Helper() 869 870 pubNc, pubJs := jsClientConnectCluster(t, c) 871 defer pubNc.Close() 872 873 config := nats.KeyValueConfig{ 874 Bucket: chaosKvTestsBucketName, 875 Replicas: replicas, 876 Description: "Test bucket", 877 } 878 879 kvs, err := pubJs.CreateKeyValue(&config) 880 if err != nil { 881 t.Fatalf("Error creating bucket: %v", err) 882 } 883 884 status, err := kvs.Status() 885 if err != nil { 886 t.Fatalf("Error retrieving bucket status: %v", err) 887 } 888 t.Logf("Bucket created: %s", status.Bucket()) 889 } 890 891 // Single client performs a set of PUT on a single key. 892 // If PUT is successful, perform a GET on the same key. 893 // If GET is successful, ensure key revision and value match the most recent successful write. 894 func TestJetStreamChaosKvPutGet(t *testing.T) { 895 896 const numOps = 100_000 897 const clusterSize = 3 898 const replicas = 3 899 const key = "key" 900 const staleReadsOk = true // Set to false to check for violations of 'read committed' consistency 901 902 c := createJetStreamClusterExplicit(t, chaosKvTestsClusterName, clusterSize) 903 defer c.shutdown() 904 905 createBucketForKvChaosTest(t, c, replicas) 906 907 chaos := createClusterChaosMonkeyController( 908 t, 909 c, 910 &clusterBouncerChaosMonkey{ 911 minDowntime: 0 * time.Second, 912 maxDowntime: 2 * time.Second, 913 minDownServers: clusterSize, // Whole cluster outage 914 maxDownServers: clusterSize, 915 pause: 1 * time.Second, 916 }, 917 ) 918 919 nc, js := jsClientConnectCluster(t, c) 920 defer nc.Close() 921 922 // Create KV bucket 923 kv, err := js.KeyValue(chaosKvTestsBucketName) 924 if err != nil { 925 t.Fatalf("Failed to get KV store: %v", err) 926 } 927 928 // Initialize the only key 929 firstRevision, err := kv.Create(key, []byte("INITIAL VALUE")) 930 if err != nil { 931 t.Fatalf("Failed to create key: %v", err) 932 } else if firstRevision != 1 { 933 t.Fatalf("Unexpected revision: %d", firstRevision) 934 } 935 936 // Start chaos 937 chaos.start() 938 defer chaos.stop() 939 940 staleReadsCount := uint64(0) 941 successCount := uint64(0) 942 943 previousRevision := firstRevision 944 945 putGetLoop: 946 for i := 1; i <= numOps; i++ { 947 948 if i%1000 == 0 { 949 t.Logf("Completed %d/%d PUT+GET operations", i, numOps) 950 } 951 952 // PUT a value 953 putValue := fmt.Sprintf("value-%d", i) 954 putRevision, err := kv.Put(key, []byte(putValue)) 955 if err != nil { 956 t.Logf("PUT error: %v", err) 957 continue putGetLoop 958 } 959 960 // Check revision is monotonically increasing 961 if putRevision <= previousRevision { 962 t.Fatalf("PUT produced revision %d which is not greater than the previous successful PUT revision: %d", putRevision, previousRevision) 963 } 964 965 previousRevision = putRevision 966 967 // If PUT was successful, GET the same 968 kve, err := kv.Get(key) 969 if err == nats.ErrKeyNotFound { 970 t.Fatalf("GET key not found, but key does exists (last PUT revision: %d)", putRevision) 971 } else if err != nil { 972 t.Logf("GET error: %v", err) 973 continue putGetLoop 974 } 975 976 getValue := string(kve.Value()) 977 getRevision := kve.Revision() 978 
		if putRevision > getRevision {
			// Stale read, violates 'read committed' consistency criteria
			if !staleReadsOk {
				t.Fatalf("PUT value %s (rev: %d) then read value %s (rev: %d)", putValue, putRevision, getValue, getRevision)
			} else {
				staleReadsCount += 1
			}
		} else if putRevision < getRevision {
			// Returned revision is higher than any ever written, this should never happen
			t.Fatalf("GET returned revision %d, but most recent expected revision is %d", getRevision, putRevision)
		} else if putValue != getValue {
			// Returned revision matches the latest, but the values do not, this should never happen
			t.Fatalf("GET returned revision %d with value %s, but value %s was just committed for that same revision", getRevision, getValue, putValue)
		} else {
			// GET returned the latest revision/value
			successCount += 1
			if chaosKvTestsDebug {
				t.Logf("PUT+GET %s=%s (rev: %d)", key, putValue, putRevision)
			}
		}
	}

	t.Logf("Completed %d PUT+GET cycles of which %d successful, %d GETs returned a stale value", numOps, successCount, staleReadsCount)
}

// A variant of TestJetStreamChaosKvPutGet where PUT is retried until successful, and GET is retried until it returns the latest known key revision.
// This validates that a confirmed PUT value is never lost and eventually becomes visible.
func TestJetStreamChaosKvPutGetWithRetries(t *testing.T) {

	const numOps = 10_000
	const maxRetries = 20
	const retryDelay = 100 * time.Millisecond
	const clusterSize = 3
	const replicas = 3
	const key = "key"

	c := createJetStreamClusterExplicit(t, chaosKvTestsClusterName, clusterSize)
	defer c.shutdown()

	createBucketForKvChaosTest(t, c, replicas)

	chaos := createClusterChaosMonkeyController(
		t,
		c,
		&clusterBouncerChaosMonkey{
			minDowntime:    0 * time.Second,
			maxDowntime:    2 * time.Second,
			minDownServers: clusterSize, // Whole cluster outage
			maxDownServers: clusterSize,
			pause:          1 * time.Second,
		},
	)

	nc, js := jsClientConnectCluster(t, c)
	defer nc.Close()

	kv, err := js.KeyValue(chaosKvTestsBucketName)
	if err != nil {
		t.Fatalf("Failed to get KV store: %v", err)
	}

	// Initialize key value
	firstRevision, err := kv.Create(key, []byte("INITIAL VALUE"))
	if err != nil {
		t.Fatalf("Failed to create key: %v", err)
	} else if firstRevision != 1 {
		t.Fatalf("Unexpected revision: %d", firstRevision)
	}

	// Start chaos
	chaos.start()
	defer chaos.stop()

	staleReadCount := 0
	previousRevision := firstRevision

putGetLoop:
	for i := 1; i <= numOps; i++ {

		if i%1000 == 0 {
			t.Logf("Completed %d/%d PUT+GET operations", i, numOps)
		}

		putValue := fmt.Sprintf("value-%d", i)
		putRevision := uint64(0)

		// Put a new value for the key, retry until successful or out of retries
	putRetryLoop:
		for r := 0; r <= maxRetries; r++ {
			var putErr error
			putRevision, putErr = kv.Put(key, []byte(putValue))
			if putErr == nil {
				break putRetryLoop
			} else if r == maxRetries {
				t.Fatalf("Failed to PUT (retried %d times): %v", maxRetries, putErr)
			} else {
				if chaosKvTestsDebug {
					t.Logf("PUT error: %v", putErr)
				}
				time.Sleep(retryDelay)
			}
		}

		// Ensure the key revision is monotonically increasing
		if putRevision <= previousRevision {
t.Fatalf("Latest PUT created revision %d which is not greater than the previous revision: %d", putRevision, previousRevision) 1085 } 1086 previousRevision = putRevision 1087 1088 // Read value for key, retry until successful, and validate corresponding version and value 1089 getRetryLoop: 1090 for r := 0; r <= maxRetries; r++ { 1091 var getErr error 1092 kve, getErr := kv.Get(key) 1093 if getErr != nil && r == maxRetries { 1094 t.Fatalf("Failed to GET (retried %d times): %v", maxRetries, getErr) 1095 } else if getErr != nil { 1096 if chaosKvTestsDebug { 1097 t.Logf("GET error: %v", getErr) 1098 } 1099 time.Sleep(retryDelay) 1100 continue getRetryLoop 1101 } 1102 1103 // GET successful, check revision and value 1104 getValue := string(kve.Value()) 1105 getRevision := kve.Revision() 1106 1107 if putRevision == getRevision { 1108 if putValue != getValue { 1109 t.Fatalf("Unexpected value %s for revision %d, expected: %s", getValue, getRevision, putValue) 1110 } 1111 if chaosKvTestsDebug { 1112 t.Logf("PUT+GET %s=%s (rev: %d) (retry: %d)", key, putValue, putRevision, r) 1113 } 1114 continue putGetLoop 1115 } else if getRevision > putRevision { 1116 t.Fatalf("GET returned version that should not exist yet: %d, last created: %d", getRevision, putRevision) 1117 } else { // get revision < put revision 1118 staleReadCount += 1 1119 if chaosKvTestsDebug { 1120 t.Logf("GET got stale value: %v (rev: %d, latest: %d)", getValue, getRevision, putRevision) 1121 } 1122 time.Sleep(retryDelay) 1123 continue getRetryLoop 1124 } 1125 } 1126 } 1127 1128 t.Logf("Client completed %d PUT+GET cycles, %d GET returned a stale value", numOps, staleReadCount) 1129 } 1130 1131 // Multiple clients updating a finite set of keys with CAS semantics. 1132 // TODO check that revision is never lower than last one seen 1133 // TODO check that KeyNotFound is never returned, as keys are initialized beforehand 1134 func TestJetStreamChaosKvCAS(t *testing.T) { 1135 const numOps = 10_000 1136 const maxRetries = 50 1137 const retryDelay = 300 * time.Millisecond 1138 const clusterSize = 3 1139 const replicas = 3 1140 const numKeys = 15 1141 const numClients = 5 1142 1143 c := createJetStreamClusterExplicit(t, chaosKvTestsClusterName, clusterSize) 1144 defer c.shutdown() 1145 1146 createBucketForKvChaosTest(t, c, replicas) 1147 1148 chaos := createClusterChaosMonkeyController( 1149 t, 1150 c, 1151 &clusterBouncerChaosMonkey{ 1152 minDowntime: 0 * time.Second, 1153 maxDowntime: 2 * time.Second, 1154 minDownServers: clusterSize, // Whole cluster outage 1155 maxDownServers: clusterSize, 1156 pause: 1 * time.Second, 1157 }, 1158 ) 1159 1160 nc, js := jsClientConnectCluster(t, c) 1161 defer nc.Close() 1162 1163 // Create bucket 1164 kv, err := js.KeyValue(chaosKvTestsBucketName) 1165 if err != nil { 1166 t.Fatalf("Failed to get KV store: %v", err) 1167 } 1168 1169 // Create set of keys and initialize them with dummy value 1170 keys := make([]string, numKeys) 1171 for k := 0; k < numKeys; k++ { 1172 key := fmt.Sprintf("key-%d", k) 1173 keys[k] = key 1174 1175 _, err := kv.Create(key, []byte("Initial value")) 1176 if err != nil { 1177 t.Fatalf("Failed to create key: %v", err) 1178 } 1179 } 1180 1181 wgStart := sync.WaitGroup{} 1182 wgComplete := sync.WaitGroup{} 1183 1184 // Client routine 1185 client := func(clientId int, kv nats.KeyValue) { 1186 defer wgComplete.Done() 1187 1188 rng := rand.New(rand.NewSource(int64(clientId))) 1189 successfulUpdates := 0 1190 casRejectUpdates := 0 1191 otherUpdateErrors := 0 1192 1193 // Map to track last 
		knownRevisions := map[string]uint64{}
		for _, key := range keys {
			knownRevisions[key] = 0
		}

		// Wait for all clients to reach this point before proceeding
		wgStart.Done()
		wgStart.Wait()

		for i := 1; i <= numOps; i++ {

			if i%1000 == 0 {
				t.Logf("Client %d completed %d/%d updates", clientId, i, numOps)
			}

			// Pick a random key from the set
			key := keys[rng.Intn(numKeys)]

			// Prepare a unique value to be written
			value := fmt.Sprintf("client: %d operation %d", clientId, i)

			// Try to update the key with CAS
			newRevision, updateErr := kv.Update(key, []byte(value), knownRevisions[key])
			if updateErr == nil {
				// Update successful
				knownRevisions[key] = newRevision
				successfulUpdates += 1
				if chaosKvTestsDebug {
					t.Logf("Client %d updated key %s, new revision: %d", clientId, key, newRevision)
				}
			} else if updateErr != nil && strings.Contains(fmt.Sprint(updateErr), "wrong last sequence") {
				// CAS rejected the update, learn the current revision for this key
				casRejectUpdates += 1

				for r := 0; r <= maxRetries; r++ {
					kve, getErr := kv.Get(key)
					if getErr == nil {
						currentRevision := kve.Revision()
						if currentRevision < knownRevisions[key] {
							// Revision number moved backward, this should never happen
							t.Fatalf("Current revision for key %s is %d, which is lower than the last known revision %d", key, currentRevision, knownRevisions[key])
						}

						knownRevisions[key] = currentRevision
						if chaosKvTestsDebug {
							t.Logf("Client %d learned key %s revision: %d", clientId, key, currentRevision)
						}
						break
					} else if r == maxRetries {
						t.Fatalf("Failed to GET (retried %d times): %v", maxRetries, getErr)
					} else {
						time.Sleep(retryDelay)
					}
				}
			} else {
				// Other update error
				otherUpdateErrors += 1
				if chaosKvTestsDebug {
					t.Logf("Client %d update error for key %s: %v", clientId, key, updateErr)
				}
				time.Sleep(retryDelay)
			}
		}
		t.Logf("Client %d done, %d kv updates, %d CAS rejected, %d other errors", clientId, successfulUpdates, casRejectUpdates, otherUpdateErrors)
	}

	// Launch all clients
	for i := 1; i <= numClients; i++ {
		cNc, cJs := jsClientConnectCluster(t, c)
		defer cNc.Close()

		cKv, err := cJs.KeyValue(chaosKvTestsBucketName)
		if err != nil {
			t.Fatalf("Failed to get KV store: %v", err)
		}

		wgStart.Add(1)
		wgComplete.Add(1)
		go client(i, cKv)
	}

	// Wait for clients to be connected and ready
	wgStart.Wait()

	// Start failures
	chaos.start()
	defer chaos.stop()

	// Wait for all clients to be done
	wgComplete.Wait()
}
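
// The sketch below is illustrative only and is not referenced by any test in this
// file: it shows how an additional ClusterChaosMonkey could be plugged into
// createClusterChaosMonkeyController. The type name and the default pause value
// are assumptions made for the example, not part of the existing test suite.
type singleServerBounceChaosMonkey struct {
	pause time.Duration
}

func (m *singleServerBounceChaosMonkey) validate(t *testing.T, c *cluster) {
	if m.pause <= 0 {
		// Default to a short pause between bounces (assumed default)
		m.pause = 3 * time.Second
	}
}

func (m *singleServerBounceChaosMonkey) run(t *testing.T, c *cluster, stopCh <-chan bool) {
	for {
		// Pause between actions, or terminate if stopped
		select {
		case <-stopCh:
			return
		case <-time.After(m.pause):
		}

		// Bounce a single random server, then wait for the cluster to recover
		servers := c.selectRandomServers(1)
		t.Logf("🐵 Bouncing server %s", servers[0].info.Name)
		c.stopSubset(servers)
		c.restartAllSamePorts()
		c.waitOnClusterHealthz()
		c.waitOnClusterReady()
		c.waitOnAllCurrent()
		c.waitOnLeader()
	}
}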