github.com/nats-io/nats-server/v2@v2.11.0-preview.2/server/jetstream_cluster_3_test.go (about) 1 // Copyright 2022-2024 The NATS Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 //go:build !skip_js_tests && !skip_js_cluster_tests_3 15 // +build !skip_js_tests,!skip_js_cluster_tests_3 16 17 package server 18 19 import ( 20 "bytes" 21 "context" 22 "encoding/json" 23 "errors" 24 "fmt" 25 "math/rand" 26 "net" 27 "os" 28 "path/filepath" 29 "reflect" 30 "strings" 31 "sync" 32 "sync/atomic" 33 "testing" 34 "time" 35 36 "github.com/nats-io/jwt/v2" 37 "github.com/nats-io/nats.go" 38 ) 39 40 func TestJetStreamClusterRemovePeerByID(t *testing.T) { 41 c := createJetStreamClusterExplicit(t, "R3S", 3) 42 defer c.shutdown() 43 44 s := c.randomNonLeader() 45 nc, js := jsClientConnect(t, s) 46 defer nc.Close() 47 48 _, err := js.AddStream(&nats.StreamConfig{ 49 Name: "TEST", 50 Subjects: []string{"foo", "bar"}, 51 Replicas: 3, 52 }) 53 require_NoError(t, err) 54 55 // Wait for a leader 56 c.waitOnStreamLeader(globalAccountName, "TEST") 57 58 // Get the name of the one that is not restarted 59 srvName := c.opts[2].ServerName 60 // And its node ID 61 peerID := c.servers[2].Node() 62 63 nc.Close() 64 // Now stop the whole cluster 65 c.stopAll() 66 // Restart all but one 67 for i := 0; i < 2; i++ { 68 opts := c.opts[i] 69 s, o := RunServerWithConfig(opts.ConfigFile) 70 c.servers[i] = s 71 c.opts[i] = o 72 } 73 74 c.waitOnClusterReadyWithNumPeers(2) 75 c.waitOnStreamLeader(globalAccountName, "TEST") 76 77 // Now attempt to remove by name, this should fail because the cluster 78 // was restarted and names are not persisted. 
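// For reference, a minimal sketch of the remove-server round trip exercised
// below. The meta "remove server" API accepts either a server name or a raft
// peer ID, which is why the test reconnects with the admin credentials from
// the cluster template before issuing it:
//
//	req := &JSApiMetaServerRemoveRequest{Peer: peerID} // or Server: srvName
//	body, _ := json.Marshal(req)
//	rmsg, _ := nc.Request(JSApiRemoveServer, body, 2*time.Second)
//	var resp JSApiMetaServerRemoveResponse
//	_ = json.Unmarshal(rmsg.Data, &resp) // check resp.Error / resp.Success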
79 ml := c.leader() 80 nc, err = nats.Connect(ml.ClientURL(), nats.UserInfo("admin", "s3cr3t!")) 81 require_NoError(t, err) 82 defer nc.Close() 83 84 req := &JSApiMetaServerRemoveRequest{Server: srvName} 85 jsreq, err := json.Marshal(req) 86 require_NoError(t, err) 87 rmsg, err := nc.Request(JSApiRemoveServer, jsreq, 2*time.Second) 88 require_NoError(t, err) 89 90 var resp JSApiMetaServerRemoveResponse 91 err = json.Unmarshal(rmsg.Data, &resp) 92 require_NoError(t, err) 93 require_True(t, resp.Error != nil) 94 require_True(t, IsNatsErr(resp.Error, JSClusterServerNotMemberErr)) 95 96 // Now try by ID, but first with an ID that does not match any peerID 97 req.Peer = "some_bad_id" 98 jsreq, err = json.Marshal(req) 99 require_NoError(t, err) 100 rmsg, err = nc.Request(JSApiRemoveServer, jsreq, 2*time.Second) 101 require_NoError(t, err) 102 103 resp = JSApiMetaServerRemoveResponse{} 104 err = json.Unmarshal(rmsg.Data, &resp) 105 require_NoError(t, err) 106 require_True(t, resp.Error != nil) 107 require_True(t, IsNatsErr(resp.Error, JSClusterServerNotMemberErr)) 108 109 // Now with the proper peer ID 110 req.Peer = peerID 111 jsreq, err = json.Marshal(req) 112 require_NoError(t, err) 113 rmsg, err = nc.Request(JSApiRemoveServer, jsreq, 2*time.Second) 114 require_NoError(t, err) 115 116 resp = JSApiMetaServerRemoveResponse{} 117 err = json.Unmarshal(rmsg.Data, &resp) 118 require_NoError(t, err) 119 require_True(t, resp.Error == nil) 120 require_True(t, resp.Success) 121 } 122 123 func TestJetStreamClusterDiscardNewAndMaxMsgsPerSubject(t *testing.T) { 124 c := createJetStreamClusterExplicit(t, "R3S", 3) 125 defer c.shutdown() 126 127 // Client for API requests. 128 s := c.randomNonLeader() 129 nc, js := jsClientConnect(t, s) 130 defer nc.Close() 131 132 for _, test := range []struct { 133 name string 134 storage StorageType 135 replicas int 136 }{ 137 {"MEM-R1", MemoryStorage, 1}, 138 {"FILE-R1", FileStorage, 1}, 139 {"MEM-R3", MemoryStorage, 3}, 140 {"FILE-R3", FileStorage, 3}, 141 } { 142 t.Run(test.name, func(t *testing.T) { 143 js.DeleteStream("KV") 144 // Make sure setting new without DiscardPolicy also being new is error. 145 cfg := &StreamConfig{ 146 Name: "KV", 147 Subjects: []string{"KV.>"}, 148 Storage: test.storage, 149 AllowDirect: true, 150 DiscardNewPer: true, 151 MaxMsgs: 10, 152 Replicas: test.replicas, 153 } 154 if _, apiErr := addStreamWithError(t, nc, cfg); apiErr == nil { 155 t.Fatalf("Expected API error but got none") 156 } else if apiErr.ErrCode != 10052 || !strings.Contains(apiErr.Description, "discard new per subject requires discard new policy") { 157 t.Fatalf("Got wrong error: %+v", apiErr) 158 } 159 160 // Set broad discard new policy to engage DiscardNewPer 161 cfg.Discard = DiscardNew 162 // We should also error here since we have not setup max msgs per subject. 163 if _, apiErr := addStreamWithError(t, nc, cfg); apiErr == nil { 164 t.Fatalf("Expected API error but got none") 165 } else if apiErr.ErrCode != 10052 || !strings.Contains(apiErr.Description, "discard new per subject requires max msgs per subject > 0") { 166 t.Fatalf("Got wrong error: %+v", apiErr) 167 } 168 169 cfg.MaxMsgsPer = 1 170 addStream(t, nc, cfg) 171 172 // We want to test that we reject new messages on a per subject basis if the 173 // max msgs per subject limit has been hit, even if other limits have not. 174 _, err := js.Publish("KV.foo", nil) 175 require_NoError(t, err) 176 177 _, err = js.Publish("KV.foo", nil) 178 // Go client does not have const for this one. 
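// To summarize the constraint exercised above: DiscardNewPer is only accepted
// when the stream also uses Discard: DiscardNew and sets MaxMsgsPer > 0. The
// config that finally succeeds looks roughly like:
//
//	cfg := &StreamConfig{
//		Name:          "KV",
//		Subjects:      []string{"KV.>"},
//		Storage:       test.storage,
//		Discard:       DiscardNew,
//		DiscardNewPer: true,
//		MaxMsgs:       10,
//		MaxMsgsPer:    1,
//		Replicas:      test.replicas,
//	}
//
// With MaxMsgsPer of 1, the second publish to "KV.foo" must be rejected on a
// per-subject basis even though the stream-wide MaxMsgs limit of 10 has not
// been reached.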
179 require_Error(t, err, errors.New("nats: maximum messages per subject exceeded")) 180 }) 181 } 182 } 183 184 func TestJetStreamClusterCreateConsumerWithReplicaOneGetsResponse(t *testing.T) { 185 c := createJetStreamClusterExplicit(t, "R3S", 3) 186 defer c.shutdown() 187 188 s := c.randomNonLeader() 189 nc, js := jsClientConnect(t, s) 190 defer nc.Close() 191 192 _, err := js.AddStream(&nats.StreamConfig{ 193 Name: "TEST", 194 Subjects: []string{"foo"}, 195 Replicas: 3, 196 }) 197 require_NoError(t, err) 198 199 c.waitOnStreamLeader(globalAccountName, "TEST") 200 201 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 202 Durable: "C3", 203 AckPolicy: nats.AckExplicitPolicy, 204 }) 205 require_NoError(t, err) 206 207 c.waitOnConsumerLeader(globalAccountName, "TEST", "C3") 208 209 // Update to scale down to R1, that should work (get a response) 210 _, err = js.UpdateConsumer("TEST", &nats.ConsumerConfig{ 211 Durable: "C3", 212 AckPolicy: nats.AckExplicitPolicy, 213 Replicas: 1, 214 }) 215 require_NoError(t, err) 216 217 c.waitOnConsumerLeader(globalAccountName, "TEST", "C3") 218 219 ci, err := js.ConsumerInfo("TEST", "C3") 220 require_NoError(t, err) 221 require_True(t, ci.Config.Replicas == 1) 222 require_True(t, len(ci.Cluster.Replicas) == 0) 223 } 224 225 func TestJetStreamClusterMetaRecoveryLogic(t *testing.T) { 226 c := createJetStreamClusterExplicit(t, "R3S", 3) 227 defer c.shutdown() 228 229 s := c.randomNonLeader() 230 nc, js := jsClientConnect(t, s) 231 defer nc.Close() 232 233 _, err := js.AddStream(&nats.StreamConfig{ 234 Name: "TEST", 235 Subjects: []string{"foo"}, 236 Replicas: 3, 237 }) 238 require_NoError(t, err) 239 240 _, err = js.UpdateStream(&nats.StreamConfig{ 241 Name: "TEST", 242 Subjects: []string{"foo", "bar"}, 243 Replicas: 1, 244 }) 245 require_NoError(t, err) 246 247 err = js.DeleteStream("TEST") 248 require_NoError(t, err) 249 250 _, err = js.AddStream(&nats.StreamConfig{ 251 Name: "TEST", 252 Subjects: []string{"foo"}, 253 Replicas: 3, 254 }) 255 require_NoError(t, err) 256 257 err = js.DeleteStream("TEST") 258 require_NoError(t, err) 259 260 _, err = js.AddStream(&nats.StreamConfig{ 261 Name: "TEST", 262 Subjects: []string{"baz"}, 263 Replicas: 1, 264 }) 265 require_NoError(t, err) 266 267 osi, err := js.StreamInfo("TEST") 268 require_NoError(t, err) 269 270 c.stopAll() 271 c.restartAll() 272 c.waitOnLeader() 273 c.waitOnStreamLeader("$G", "TEST") 274 275 s = c.randomNonLeader() 276 nc, js = jsClientConnect(t, s) 277 defer nc.Close() 278 279 si, err := js.StreamInfo("TEST") 280 require_NoError(t, err) 281 282 if !reflect.DeepEqual(si.Config, osi.Config) { 283 t.Fatalf("Expected %+v, but got %+v", osi.Config, si.Config) 284 } 285 } 286 287 func TestJetStreamClusterDeleteConsumerWhileServerDown(t *testing.T) { 288 c := createJetStreamClusterExplicit(t, "R3S", 3) 289 defer c.shutdown() 290 291 nc, js := jsClientConnect(t, c.randomNonLeader()) 292 defer nc.Close() 293 294 _, err := js.AddStream(&nats.StreamConfig{ 295 Name: "TEST", 296 Subjects: []string{"foo"}, 297 Replicas: 3, 298 }) 299 require_NoError(t, err) 300 301 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 302 Durable: "DC", 303 AckPolicy: nats.AckExplicitPolicy, 304 Replicas: 3, 305 }) 306 require_NoError(t, err) 307 308 s := c.randomNonConsumerLeader("$G", "TEST", "DC") 309 s.Shutdown() 310 311 c.waitOnLeader() // In case that was metaleader. 312 nc, js = jsClientConnect(t, c.randomNonLeader()) // In case we were connected there. 
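// From here the sequence is: delete the consumer while one replica is down,
// restart that replica, wait for it to report healthy, and then verify the
// delete was applied there as well (first via raft replay, then again below
// after forcing a meta snapshot). The health wait polls the server's healthz
// handler directly, roughly:
//
//	checkFor(t, 10*time.Second, 200*time.Millisecond, func() error {
//		if hs := s.healthz(&HealthzOptions{}); hs.Error != _EMPTY_ {
//			return errors.New(hs.Error)
//		}
//		return nil
//	})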
313 defer nc.Close() 314 315 err = js.DeleteConsumer("TEST", "DC") 316 require_NoError(t, err) 317 318 // Restart. 319 s = c.restartServer(s) 320 checkFor(t, 10*time.Second, 200*time.Millisecond, func() error { 321 hs := s.healthz(&HealthzOptions{ 322 JSEnabledOnly: false, 323 JSServerOnly: false, 324 }) 325 if hs.Error != _EMPTY_ { 326 return errors.New(hs.Error) 327 } 328 return nil 329 }) 330 331 // Make sure we can not see it on the server that was down at the time of delete. 332 mset, err := s.GlobalAccount().lookupStream("TEST") 333 require_NoError(t, err) 334 335 if o := mset.lookupConsumer("DC"); o != nil { 336 t.Fatalf("Expected to not find consumer, but did") 337 } 338 339 // Now repeat but force a meta snapshot. 340 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 341 Durable: "DC", 342 AckPolicy: nats.AckExplicitPolicy, 343 Replicas: 3, 344 }) 345 require_NoError(t, err) 346 347 s = c.randomNonConsumerLeader("$G", "TEST", "DC") 348 s.Shutdown() 349 350 c.waitOnLeader() // In case that was metaleader. 351 nc, js = jsClientConnect(t, c.randomNonLeader()) // In case we were connected there. 352 defer nc.Close() 353 354 err = js.DeleteConsumer("TEST", "DC") 355 require_NoError(t, err) 356 357 err = c.leader().JetStreamSnapshotMeta() 358 require_NoError(t, err) 359 360 // Restart. 361 s = c.restartServer(s) 362 checkFor(t, time.Second*2, 200*time.Millisecond, func() error { 363 hs := s.healthz(&HealthzOptions{ 364 JSEnabledOnly: false, 365 JSServerOnly: false, 366 }) 367 if hs.Error != _EMPTY_ { 368 return errors.New(hs.Error) 369 } 370 return nil 371 }) 372 373 // Make sure we can not see it on the server that was down at the time of delete. 374 mset, err = s.GlobalAccount().lookupStream("TEST") 375 require_NoError(t, err) 376 377 if o := mset.lookupConsumer("DC"); o != nil { 378 t.Fatalf("Expected to not find consumer, but did") 379 } 380 } 381 382 func TestJetStreamClusterNegativeReplicas(t *testing.T) { 383 s := RunBasicJetStreamServer(t) 384 defer s.Shutdown() 385 386 c := createJetStreamClusterExplicit(t, "R3S", 3) 387 defer c.shutdown() 388 389 testBadReplicas := func(t *testing.T, s *Server, name string) { 390 nc, js := jsClientConnect(t, s) 391 defer nc.Close() 392 393 _, err := js.AddStream(&nats.StreamConfig{ 394 Name: name, 395 Replicas: -1, 396 }) 397 require_Error(t, err, NewJSReplicasCountCannotBeNegativeError()) 398 399 _, err = js.AddStream(&nats.StreamConfig{ 400 Name: name, 401 Replicas: 1, 402 }) 403 require_NoError(t, err) 404 405 // Check update now. 
406 _, err = js.UpdateStream(&nats.StreamConfig{ 407 Name: name, 408 Replicas: -11, 409 }) 410 require_Error(t, err, NewJSReplicasCountCannotBeNegativeError()) 411 412 // Now same for consumers 413 durName := fmt.Sprintf("%s_dur", name) 414 _, err = js.AddConsumer(name, &nats.ConsumerConfig{ 415 Durable: durName, 416 Replicas: -1, 417 }) 418 require_Error(t, err, NewJSReplicasCountCannotBeNegativeError()) 419 420 _, err = js.AddConsumer(name, &nats.ConsumerConfig{ 421 Durable: durName, 422 Replicas: 1, 423 }) 424 require_NoError(t, err) 425 426 // Check update now 427 _, err = js.UpdateConsumer(name, &nats.ConsumerConfig{ 428 Durable: durName, 429 Replicas: -11, 430 }) 431 require_Error(t, err, NewJSReplicasCountCannotBeNegativeError()) 432 } 433 434 t.Run("Standalone", func(t *testing.T) { testBadReplicas(t, s, "TEST1") }) 435 t.Run("Clustered", func(t *testing.T) { testBadReplicas(t, c.randomServer(), "TEST2") }) 436 } 437 438 func TestJetStreamClusterUserGivenConsName(t *testing.T) { 439 s := RunBasicJetStreamServer(t) 440 defer s.Shutdown() 441 442 c := createJetStreamClusterExplicit(t, "R3S", 3) 443 defer c.shutdown() 444 445 test := func(t *testing.T, s *Server, stream string, replicas int, cons string) { 446 nc, js := jsClientConnect(t, s) 447 defer nc.Close() 448 449 _, err := js.AddStream(&nats.StreamConfig{ 450 Name: stream, 451 Replicas: replicas, 452 }) 453 require_NoError(t, err) 454 455 cc := &CreateConsumerRequest{ 456 Stream: stream, 457 Config: ConsumerConfig{ 458 Name: cons, 459 FilterSubject: stream, 460 InactiveThreshold: 10 * time.Second, 461 }, 462 } 463 subj := fmt.Sprintf(JSApiConsumerCreateExT, stream, cons, stream) 464 req, err := json.Marshal(cc) 465 require_NoError(t, err) 466 467 reply, err := nc.Request(subj, req, 2*time.Second) 468 require_NoError(t, err) 469 470 var cresp JSApiConsumerCreateResponse 471 json.Unmarshal(reply.Data, &cresp) 472 if cresp.Error != nil { 473 t.Fatalf("Unexpected error: %v", cresp.Error) 474 } 475 require_Equal(t, cresp.Name, cons) 476 require_Equal(t, cresp.Config.Name, cons) 477 478 // Resend the add request, but first change something that the server 479 // should reject, since the consumer already exists and we don't support 480 // updating the consumer that way.
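// The extended create subject encodes stream, consumer and filter: the
// Sprintf above expands JSApiConsumerCreateExT to something of the form
// $JS.API.CONSUMER.CREATE.<stream>.<consumer>.<filter>. Re-sending a create
// on that subject with a modified config (DeliverPolicy below) is expected to
// come back with a consumer-create error rather than being treated as an
// implicit update.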
481 cc.Config.DeliverPolicy = DeliverNew 482 req, err = json.Marshal(cc) 483 require_NoError(t, err) 484 reply, err = nc.Request(subj, req, 2*time.Second) 485 require_NoError(t, err) 486 487 cresp = JSApiConsumerCreateResponse{} 488 json.Unmarshal(reply.Data, &cresp) 489 require_Error(t, cresp.Error, NewJSConsumerCreateError(errors.New("deliver policy can not be updated"))) 490 } 491 492 t.Run("Standalone", func(t *testing.T) { test(t, s, "TEST", 1, "cons") }) 493 t.Run("Clustered R1", func(t *testing.T) { test(t, c.randomServer(), "TEST2", 1, "cons2") }) 494 t.Run("Clustered R3", func(t *testing.T) { test(t, c.randomServer(), "TEST3", 3, "cons3") }) 495 } 496 497 func TestJetStreamClusterUserGivenConsNameWithLeaderChange(t *testing.T) { 498 c := createJetStreamClusterExplicit(t, "R5S", 5) 499 defer c.shutdown() 500 501 nc, js := jsClientConnect(t, c.randomServer()) 502 defer nc.Close() 503 504 _, err := js.AddStream(&nats.StreamConfig{ 505 Name: "TEST", 506 Subjects: []string{"foo"}, 507 Replicas: 3, 508 }) 509 require_NoError(t, err) 510 511 c.waitOnStreamLeader(globalAccountName, "TEST") 512 for i := 0; i < 100; i++ { 513 sendStreamMsg(t, nc, "foo", "msg") 514 } 515 516 consName := "myephemeral" 517 cc := &CreateConsumerRequest{ 518 Stream: "TEST", 519 Config: ConsumerConfig{ 520 Name: consName, 521 FilterSubject: "foo", 522 InactiveThreshold: time.Hour, 523 Replicas: 3, 524 }, 525 } 526 subj := fmt.Sprintf(JSApiConsumerCreateExT, "TEST", consName, "foo") 527 req, err := json.Marshal(cc) 528 require_NoError(t, err) 529 530 reply, err := nc.Request(subj, req, 2*time.Second) 531 require_NoError(t, err) 532 533 var cresp JSApiConsumerCreateResponse 534 json.Unmarshal(reply.Data, &cresp) 535 if cresp.Error != nil { 536 t.Fatalf("Unexpected error: %v", cresp.Error) 537 } 538 require_Equal(t, cresp.Name, consName) 539 require_Equal(t, cresp.Config.Name, consName) 540 541 // Consumer leader name 542 clname := cresp.ConsumerInfo.Cluster.Leader 543 544 nreq := &JSApiConsumerGetNextRequest{Batch: 1, Expires: time.Second} 545 req, err = json.Marshal(nreq) 546 require_NoError(t, err) 547 548 sub := natsSubSync(t, nc, "xxx") 549 rsubj := fmt.Sprintf(JSApiRequestNextT, "TEST", consName) 550 err = nc.PublishRequest(rsubj, "xxx", req) 551 require_NoError(t, err) 552 553 msg := natsNexMsg(t, sub, time.Second) 554 require_Equal(t, string(msg.Data), "msg") 555 556 // Shutdown the consumer leader 557 cl := c.serverByName(clname) 558 cl.Shutdown() 559 560 // Wait for a bit to be sure that we lost leadership 561 time.Sleep(250 * time.Millisecond) 562 563 // Wait for new leader 564 c.waitOnStreamLeader(globalAccountName, "TEST") 565 c.waitOnConsumerLeader(globalAccountName, "TEST", consName) 566 567 // Make sure we can still consume. 
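// Pull fetches here are raw core NATS requests against the consumer's
// next-message subject (JSApiRequestNextT, of the form
// $JS.API.CONSUMER.MSG.NEXT.<stream>.<consumer>) with a JSON
// JSApiConsumerGetNextRequest body; replies are delivered to the "xxx" inbox
// subscription created above. Right after the leader change the first attempt
// may only yield an empty status reply, hence the short retry loop below.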
568 for i := 0; i < 2; i++ { 569 err = nc.PublishRequest(rsubj, "xxx", req) 570 require_NoError(t, err) 571 572 msg = natsNexMsg(t, sub, time.Second) 573 if len(msg.Data) == 0 { 574 continue 575 } 576 require_Equal(t, string(msg.Data), "msg") 577 return 578 } 579 t.Fatal("Did not receive message") 580 } 581 582 func TestJetStreamClusterMirrorCrossDomainOnLeadnodeNoSystemShare(t *testing.T) { 583 tmpl := strings.Replace(jsClusterAccountsTempl, "store_dir:", "domain: HUB, store_dir:", 1) 584 c := createJetStreamCluster(t, tmpl, "CORE", _EMPTY_, 3, 18033, true) 585 defer c.shutdown() 586 587 tmpl = strings.Replace(jsClusterTemplWithSingleLeafNode, "store_dir:", "domain: SPOKE, store_dir:", 1) 588 ln := c.createLeafNodeWithTemplateNoSystem("LN-SPOKE", tmpl) 589 defer ln.Shutdown() 590 591 checkLeafNodeConnectedCount(t, ln, 1) 592 593 // Create origin stream in hub. 594 nc, js := jsClientConnect(t, c.randomServer()) 595 defer nc.Close() 596 597 _, err := js.AddStream(&nats.StreamConfig{ 598 Name: "TEST", 599 Subjects: []string{"foo"}, 600 MaxMsgsPerSubject: 10, 601 AllowDirect: true, 602 }) 603 require_NoError(t, err) 604 605 // Now create the mirror on the leafnode. 606 lnc, ljs := jsClientConnect(t, ln) 607 defer lnc.Close() 608 609 _, err = ljs.AddStream(&nats.StreamConfig{ 610 Name: "M", 611 MaxMsgsPerSubject: 10, 612 AllowDirect: true, 613 MirrorDirect: true, 614 Mirror: &nats.StreamSource{ 615 Name: "TEST", 616 External: &nats.ExternalStream{ 617 APIPrefix: "$JS.HUB.API", 618 }, 619 }, 620 }) 621 require_NoError(t, err) 622 623 // Publish to the hub stream and make sure the mirror gets those messages. 624 for i := 0; i < 20; i++ { 625 js.Publish("foo", nil) 626 } 627 628 si, err := js.StreamInfo("TEST") 629 require_NoError(t, err) 630 require_True(t, si.State.Msgs == 10) 631 632 checkFor(t, time.Second, 200*time.Millisecond, func() error { 633 si, err := ljs.StreamInfo("M") 634 require_NoError(t, err) 635 if si.State.Msgs == 10 { 636 return nil 637 } 638 return fmt.Errorf("State not current: %+v", si.State) 639 }) 640 } 641 642 func TestJetStreamClusterFirstSeqMismatch(t *testing.T) { 643 c := createJetStreamClusterWithTemplateAndModHook(t, jsClusterTempl, "C", 3, 644 func(serverName, clusterName, storeDir, conf string) string { 645 tf := createTempFile(t, "") 646 logName := tf.Name() 647 tf.Close() 648 return fmt.Sprintf("%s\nlogfile: '%s'", conf, logName) 649 }) 650 defer c.shutdown() 651 652 rs := c.randomServer() 653 nc, js := jsClientConnect(t, rs) 654 defer nc.Close() 655 656 _, err := js.AddStream(&nats.StreamConfig{ 657 Name: "TEST", 658 Subjects: []string{"foo"}, 659 Replicas: 3, 660 MaxAge: 2 * time.Second, 661 }) 662 require_NoError(t, err) 663 664 c.waitOnStreamLeader(globalAccountName, "TEST") 665 666 mset, err := c.streamLeader(globalAccountName, "TEST").GlobalAccount().lookupStream("TEST") 667 require_NoError(t, err) 668 node := mset.raftNode() 669 670 nl := c.randomNonStreamLeader(globalAccountName, "TEST") 671 if rs == nl { 672 nc.Close() 673 for _, s := range c.servers { 674 if s != nl { 675 nc, _ = jsClientConnect(t, s) 676 defer nc.Close() 677 break 678 } 679 } 680 } 681 682 wg := sync.WaitGroup{} 683 wg.Add(1) 684 ch := make(chan struct{}) 685 go func() { 686 defer wg.Done() 687 for i := 0; ; i++ { 688 sendStreamMsg(t, nc, "foo", "msg") 689 select { 690 case <-ch: 691 return 692 default: 693 } 694 } 695 }() 696 697 time.Sleep(2500 * time.Millisecond) 698 nl.Shutdown() 699 700 time.Sleep(500 * time.Millisecond) 701 node.InstallSnapshot(mset.stateSnapshot()) 702 
time.Sleep(3500 * time.Millisecond) 703 704 c.restartServer(nl) 705 c.waitOnAllCurrent() 706 707 close(ch) 708 wg.Wait() 709 710 log := nl.getOpts().LogFile 711 nl.Shutdown() 712 713 content, err := os.ReadFile(log) 714 require_NoError(t, err) 715 if bytes.Contains(content, []byte(errFirstSequenceMismatch.Error())) { 716 t.Fatalf("First sequence mismatch occurred!") 717 } 718 } 719 720 func TestJetStreamClusterConsumerInactiveThreshold(t *testing.T) { 721 // Create a standalone, a cluster, and a super cluster 722 723 s := RunBasicJetStreamServer(t) 724 defer s.Shutdown() 725 726 c := createJetStreamClusterExplicit(t, "R3S", 3) 727 defer c.shutdown() 728 729 sc := createJetStreamSuperCluster(t, 3, 2) 730 defer sc.shutdown() 731 732 test := func(t *testing.T, c *cluster, s *Server, replicas int) { 733 if c != nil { 734 s = c.randomServer() 735 } 736 nc, js := jsClientConnect(t, s) 737 defer nc.Close() 738 739 sname := fmt.Sprintf("TEST%d", replicas) 740 _, err := js.AddStream(&nats.StreamConfig{ 741 Name: sname, 742 Subjects: []string{sname}, 743 Replicas: replicas, 744 }) 745 require_NoError(t, err) 746 747 if c != nil { 748 c.waitOnStreamLeader(globalAccountName, sname) 749 } 750 751 for i := 0; i < 10; i++ { 752 js.PublishAsync(sname, []byte("ok")) 753 } 754 select { 755 case <-js.PublishAsyncComplete(): 756 case <-time.After(5 * time.Second): 757 t.Fatalf("Did not receive completion signal") 758 } 759 760 waitOnCleanup := func(ci *nats.ConsumerInfo) { 761 t.Helper() 762 checkFor(t, 2*time.Second, 50*time.Millisecond, func() error { 763 _, err := js.ConsumerInfo(ci.Stream, ci.Name) 764 if err == nil { 765 return fmt.Errorf("Consumer still present") 766 } 767 return nil 768 }) 769 } 770 771 // Test to make sure inactive threshold is enforced for all types. 772 // Ephemeral and Durable, both push and pull. 773 774 // Ephemeral Push (no bind to deliver subject) 775 ci, err := js.AddConsumer(sname, &nats.ConsumerConfig{ 776 DeliverSubject: "_no_bind_", 777 InactiveThreshold: 50 * time.Millisecond, 778 }) 779 require_NoError(t, err) 780 waitOnCleanup(ci) 781 782 // Ephemeral Pull 783 ci, err = js.AddConsumer(sname, &nats.ConsumerConfig{ 784 AckPolicy: nats.AckExplicitPolicy, 785 InactiveThreshold: 50 * time.Millisecond, 786 }) 787 require_NoError(t, err) 788 waitOnCleanup(ci) 789 790 // Support InactiveThresholds for Durables as well. 
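// Historically InactiveThreshold only applied to ephemerals; the durable cases
// below (push and pull, with the threshold set at create time or added later
// via UpdateConsumer) must be reaped the same way once they sit idle past the
// threshold.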
791 792 // Durable Push (no bind to deliver subject) 793 ci, err = js.AddConsumer(sname, &nats.ConsumerConfig{ 794 Durable: "d1", 795 DeliverSubject: "_no_bind_", 796 InactiveThreshold: 50 * time.Millisecond, 797 }) 798 require_NoError(t, err) 799 waitOnCleanup(ci) 800 801 // Durable Push (no bind to deliver subject) with an activity 802 // threshold set after creation 803 ci, err = js.AddConsumer(sname, &nats.ConsumerConfig{ 804 Durable: "d2", 805 DeliverSubject: "_no_bind_", 806 }) 807 require_NoError(t, err) 808 if c != nil { 809 c.waitOnConsumerLeader(globalAccountName, sname, "d2") 810 } 811 _, err = js.UpdateConsumer(sname, &nats.ConsumerConfig{ 812 Durable: "d2", 813 DeliverSubject: "_no_bind_", 814 InactiveThreshold: 50 * time.Millisecond, 815 }) 816 require_NoError(t, err) 817 waitOnCleanup(ci) 818 819 // Durable Pull 820 ci, err = js.AddConsumer(sname, &nats.ConsumerConfig{ 821 Durable: "d3", 822 AckPolicy: nats.AckExplicitPolicy, 823 InactiveThreshold: 50 * time.Millisecond, 824 }) 825 require_NoError(t, err) 826 waitOnCleanup(ci) 827 828 // Durable Pull with an inactivity threshold set after creation 829 ci, err = js.AddConsumer(sname, &nats.ConsumerConfig{ 830 Durable: "d4", 831 AckPolicy: nats.AckExplicitPolicy, 832 }) 833 require_NoError(t, err) 834 if c != nil { 835 c.waitOnConsumerLeader(globalAccountName, sname, "d4") 836 } 837 _, err = js.UpdateConsumer(sname, &nats.ConsumerConfig{ 838 Durable: "d4", 839 AckPolicy: nats.AckExplicitPolicy, 840 InactiveThreshold: 50 * time.Millisecond, 841 }) 842 require_NoError(t, err) 843 waitOnCleanup(ci) 844 } 845 846 t.Run("standalone", func(t *testing.T) { test(t, nil, s, 1) }) 847 t.Run("cluster-r1", func(t *testing.T) { test(t, c, nil, 1) }) 848 t.Run("cluster-r3", func(t *testing.T) { test(t, c, nil, 3) }) 849 t.Run("super-cluster-r1", func(t *testing.T) { test(t, sc.randomCluster(), nil, 1) }) 850 t.Run("super-cluster-r3", func(t *testing.T) { test(t, sc.randomCluster(), nil, 3) }) 851 } 852 853 // To capture our false warnings for clustered stream lag. 854 type testStreamLagWarnLogger struct { 855 DummyLogger 856 ch chan string 857 } 858 859 func (l *testStreamLagWarnLogger) Warnf(format string, v ...any) { 860 msg := fmt.Sprintf(format, v...) 861 if strings.Contains(msg, "has high message lag") { 862 select { 863 case l.ch <- msg: 864 default: 865 } 866 } 867 } 868 869 // False triggering warnings on stream lag because not offsetting by failures. 870 func TestJetStreamClusterStreamLagWarning(t *testing.T) { 871 c := createJetStreamClusterExplicit(t, "R3S", 3) 872 defer c.shutdown() 873 874 nc, js := jsClientConnect(t, c.randomServer()) 875 defer nc.Close() 876 877 _, err := js.AddStream(&nats.StreamConfig{ 878 Name: "TEST", 879 Subjects: []string{"foo"}, 880 Replicas: 3, 881 }) 882 require_NoError(t, err) 883 884 sl := c.streamLeader("$G", "TEST") 885 886 l := &testStreamLagWarnLogger{ch: make(chan string, 10)} 887 sl.SetLogger(l, false, false) 888 889 // We only need to trigger post RAFT propose failures that increment mset.clfs. 890 // Dedupe with msgIDs is one, so we will use that. 891 m := nats.NewMsg("foo") 892 m.Data = []byte("OK") 893 m.Header.Set(JSMsgId, "zz") 894 895 // Make sure we know we will trip the warning threshold. 
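// Every publish after the first carries the same JSMsgId and is dropped as a
// duplicate, which bumps the stream's count of failed proposals (mset.clfs)
// without adding entries to the log. The lag calculation has to offset by that
// count; if it did not, the 2*streamLagWarnThreshold duplicate publishes below
// would falsely trip the "has high message lag" warning the logger above is
// watching for.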
896 for i := 0; i < 2*streamLagWarnThreshold; i++ { 897 js.PublishMsgAsync(m) 898 } 899 select { 900 case <-js.PublishAsyncComplete(): 901 case <-time.After(5 * time.Second): 902 t.Fatalf("Did not receive completion signal") 903 } 904 905 select { 906 case msg := <-l.ch: 907 t.Fatalf("Unexpected msg lag warning seen: %s", msg) 908 case <-time.After(100 * time.Millisecond): 909 // OK 910 } 911 } 912 913 // https://github.com/nats-io/nats-server/issues/3603 914 func TestJetStreamClusterSignalPullConsumersOnDelete(t *testing.T) { 915 c := createJetStreamClusterExplicit(t, "R3S", 3) 916 defer c.shutdown() 917 918 nc, js := jsClientConnect(t, c.randomServer()) 919 defer nc.Close() 920 921 _, err := js.AddStream(&nats.StreamConfig{ 922 Name: "TEST", 923 Subjects: []string{"foo"}, 924 Replicas: 3, 925 }) 926 require_NoError(t, err) 927 928 // Create 2 pull consumers. 929 sub1, err := js.PullSubscribe("foo", "d1") 930 require_NoError(t, err) 931 932 sub2, err := js.PullSubscribe("foo", "d2") 933 require_NoError(t, err) 934 935 // We want to make sure we get kicked out prior to the timeout 936 // when consumers are being deleted or the parent stream is being deleted. 937 // Note this should be lower case, Go client needs to be updated. 938 expectedErr := errors.New("nats: consumer deleted") 939 940 // Queue up the delete for sub1 941 time.AfterFunc(250*time.Millisecond, func() { js.DeleteConsumer("TEST", "d1") }) 942 start := time.Now() 943 _, err = sub1.Fetch(1, nats.MaxWait(10*time.Second)) 944 require_Error(t, err, expectedErr) 945 946 // Check that we bailed early. 947 if time.Since(start) > time.Second { 948 t.Fatalf("Took to long to bail out on consumer delete") 949 } 950 951 time.AfterFunc(250*time.Millisecond, func() { js.DeleteStream("TEST") }) 952 start = time.Now() 953 _, err = sub2.Fetch(1, nats.MaxWait(10*time.Second)) 954 require_Error(t, err, expectedErr) 955 if time.Since(start) > time.Second { 956 t.Fatalf("Took to long to bail out on stream delete") 957 } 958 } 959 960 // https://github.com/nats-io/nats-server/issues/3559 961 func TestJetStreamClusterSourceWithOptStartTime(t *testing.T) { 962 s := RunBasicJetStreamServer(t) 963 defer s.Shutdown() 964 965 c := createJetStreamClusterExplicit(t, "R3S", 3) 966 defer c.shutdown() 967 968 test := func(t *testing.T, c *cluster, s *Server) { 969 970 replicas := 1 971 if c != nil { 972 s = c.randomServer() 973 replicas = 3 974 } 975 nc, js := jsClientConnect(t, s) 976 defer nc.Close() 977 978 _, err := js.AddStream(&nats.StreamConfig{ 979 Name: "TEST", 980 Subjects: []string{"foo"}, 981 Replicas: replicas, 982 }) 983 require_NoError(t, err) 984 985 yesterday := time.Now().Add(-24 * time.Hour) 986 987 _, err = js.AddStream(&nats.StreamConfig{ 988 Name: "SOURCE", 989 Replicas: replicas, 990 Sources: []*nats.StreamSource{{ 991 Name: "TEST", 992 OptStartTime: &yesterday, 993 }}, 994 }) 995 require_NoError(t, err) 996 997 _, err = js.AddStream(&nats.StreamConfig{ 998 Name: "MIRROR", 999 Replicas: replicas, 1000 Mirror: &nats.StreamSource{ 1001 Name: "TEST", 1002 OptStartTime: &yesterday, 1003 }, 1004 }) 1005 require_NoError(t, err) 1006 1007 total := 10 1008 for i := 0; i < total; i++ { 1009 sendStreamMsg(t, nc, "foo", "hello") 1010 } 1011 1012 checkCount := func(sname string, expected int) { 1013 t.Helper() 1014 checkFor(t, 10*time.Second, 50*time.Millisecond, func() error { 1015 si, err := js.StreamInfo(sname) 1016 if err != nil { 1017 return err 1018 } 1019 if n := si.State.Msgs; n != uint64(expected) { 1020 return fmt.Errorf("Expected 
stream %q to have %v messages, got %v", sname, expected, n) 1021 } 1022 return nil 1023 }) 1024 } 1025 1026 checkCount("TEST", 10) 1027 checkCount("SOURCE", 10) 1028 checkCount("MIRROR", 10) 1029 1030 err = js.PurgeStream("SOURCE") 1031 require_NoError(t, err) 1032 err = js.PurgeStream("MIRROR") 1033 require_NoError(t, err) 1034 1035 checkCount("TEST", 10) 1036 checkCount("SOURCE", 0) 1037 checkCount("MIRROR", 0) 1038 1039 nc.Close() 1040 if c != nil { 1041 c.stopAll() 1042 c.restartAll() 1043 1044 c.waitOnStreamLeader(globalAccountName, "TEST") 1045 c.waitOnStreamLeader(globalAccountName, "SOURCE") 1046 c.waitOnStreamLeader(globalAccountName, "MIRROR") 1047 1048 s = c.randomServer() 1049 } else { 1050 sd := s.JetStreamConfig().StoreDir 1051 s.Shutdown() 1052 s = RunJetStreamServerOnPort(-1, sd) 1053 defer s.Shutdown() 1054 } 1055 1056 // Wait a bit before checking because sync'ing (even with the defect) 1057 // would not happen right away. I tried with 1 sec and test would pass, 1058 // so need to be at least that much. 1059 time.Sleep(2 * time.Second) 1060 1061 nc, js = jsClientConnect(t, s) 1062 defer nc.Close() 1063 checkCount("TEST", 10) 1064 checkCount("SOURCE", 0) 1065 checkCount("MIRROR", 0) 1066 } 1067 1068 t.Run("standalone", func(t *testing.T) { test(t, nil, s) }) 1069 t.Run("cluster", func(t *testing.T) { test(t, c, nil) }) 1070 } 1071 1072 type networkCableUnplugged struct { 1073 net.Conn 1074 sync.Mutex 1075 unplugged bool 1076 wb bytes.Buffer 1077 wg sync.WaitGroup 1078 } 1079 1080 func (c *networkCableUnplugged) Write(b []byte) (int, error) { 1081 c.Lock() 1082 if c.unplugged { 1083 c.wb.Write(b) 1084 c.Unlock() 1085 return len(b), nil 1086 } else if c.wb.Len() > 0 { 1087 c.wb.Write(b) 1088 buf := c.wb.Bytes() 1089 c.wb.Reset() 1090 c.Unlock() 1091 if _, err := c.Conn.Write(buf); err != nil { 1092 return 0, err 1093 } 1094 return len(b), nil 1095 } 1096 c.Unlock() 1097 return c.Conn.Write(b) 1098 } 1099 1100 func (c *networkCableUnplugged) Read(b []byte) (int, error) { 1101 c.Lock() 1102 wait := c.unplugged 1103 c.Unlock() 1104 if wait { 1105 c.wg.Wait() 1106 } 1107 return c.Conn.Read(b) 1108 } 1109 1110 func TestJetStreamClusterScaleDownWhileNoQuorum(t *testing.T) { 1111 c := createJetStreamClusterExplicit(t, "R5S", 5) 1112 defer c.shutdown() 1113 1114 s := c.randomServer() 1115 nc, js := jsClientConnect(t, s) 1116 defer nc.Close() 1117 1118 si, err := js.AddStream(&nats.StreamConfig{ 1119 Name: "TEST", 1120 Subjects: []string{"foo"}, 1121 Replicas: 2, 1122 }) 1123 require_NoError(t, err) 1124 1125 for i := 0; i < 1000; i++ { 1126 sendStreamMsg(t, nc, "foo", "msg") 1127 } 1128 1129 // Let's have a server from this R2 stream be network partitionned. 1130 // We will take the leader, but doesn't have to be. 1131 // To simulate partition, we will replace all its routes with a 1132 // special connection that drops messages. 
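// networkCableUnplugged (defined above) wraps a route's net.Conn so the link
// looks dead without actually closing the TCP connection: while unplugged,
// writes are buffered instead of sent and reads block on the WaitGroup. The
// wiring per route looks like:
//
//	ncu := &networkCableUnplugged{Conn: r.nc, unplugged: true}
//	ncu.wg.Add(1) // Read blocks on this until we "plug the cable back in"
//	r.nc = ncu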
1133 sl := c.serverByName(si.Cluster.Leader) 1134 if s == sl { 1135 nc.Close() 1136 for s = c.randomServer(); s != sl; s = c.randomServer() { 1137 } 1138 nc, js = jsClientConnect(t, s) 1139 defer nc.Close() 1140 } 1141 1142 sl.mu.Lock() 1143 sl.forEachRoute(func(r *client) { 1144 r.mu.Lock() 1145 ncu := &networkCableUnplugged{Conn: r.nc, unplugged: true} 1146 ncu.wg.Add(1) 1147 r.nc = ncu 1148 r.mu.Unlock() 1149 }) 1150 sl.mu.Unlock() 1151 1152 // Wait for the stream info to fail 1153 checkFor(t, 10*time.Second, 100*time.Millisecond, func() error { 1154 si, err := js.StreamInfo("TEST", nats.MaxWait(time.Second)) 1155 if err != nil { 1156 return err 1157 } 1158 if si.Cluster.Leader == _EMPTY_ { 1159 return nil 1160 } 1161 return fmt.Errorf("stream still has a leader") 1162 }) 1163 1164 // Make sure if meta leader was on same server as stream leader we make sure 1165 // it elects new leader to receive update request. 1166 c.waitOnLeader() 1167 1168 // Now try to edit the stream by making it an R1. In some case we get 1169 // a context deadline error, in some no error. So don't check the returned error. 1170 js.UpdateStream(&nats.StreamConfig{ 1171 Name: "TEST", 1172 Subjects: []string{"foo"}, 1173 Replicas: 1, 1174 }, nats.MaxWait(5*time.Second)) 1175 1176 sl.mu.Lock() 1177 sl.forEachRoute(func(r *client) { 1178 r.mu.Lock() 1179 ncu := r.nc.(*networkCableUnplugged) 1180 ncu.Lock() 1181 ncu.unplugged = false 1182 ncu.wg.Done() 1183 ncu.Unlock() 1184 r.mu.Unlock() 1185 }) 1186 sl.mu.Unlock() 1187 1188 checkClusterFormed(t, c.servers...) 1189 c.waitOnStreamLeader(globalAccountName, "TEST") 1190 } 1191 1192 // We noticed that ha_assets enforcement seemed to not be upheld when assets created in a rapid fashion. 1193 func TestJetStreamClusterHAssetsEnforcement(t *testing.T) { 1194 tmpl := strings.Replace(jsClusterTempl, "store_dir:", "limits: {max_ha_assets: 2}, store_dir:", 1) 1195 c := createJetStreamClusterWithTemplateAndModHook(t, tmpl, "R3S", 3, nil) 1196 defer c.shutdown() 1197 1198 nc, js := jsClientConnect(t, c.randomServer()) 1199 defer nc.Close() 1200 1201 _, err := js.AddStream(&nats.StreamConfig{ 1202 Name: "TEST-1", 1203 Subjects: []string{"foo"}, 1204 Replicas: 3, 1205 }) 1206 require_NoError(t, err) 1207 1208 _, err = js.AddStream(&nats.StreamConfig{ 1209 Name: "TEST-2", 1210 Subjects: []string{"bar"}, 1211 Replicas: 3, 1212 }) 1213 require_NoError(t, err) 1214 1215 exceededErrs := []error{errors.New("system limit reached"), errors.New("no suitable peers")} 1216 1217 // Should fail. 1218 _, err = js.AddStream(&nats.StreamConfig{ 1219 Name: "TEST-3", 1220 Subjects: []string{"baz"}, 1221 Replicas: 3, 1222 }) 1223 require_Error(t, err, exceededErrs...) 1224 } 1225 1226 func TestJetStreamClusterInterestStreamConsumer(t *testing.T) { 1227 c := createJetStreamClusterExplicit(t, "R5S", 5) 1228 defer c.shutdown() 1229 1230 nc, js := jsClientConnect(t, c.randomServer()) 1231 defer nc.Close() 1232 1233 _, err := js.AddStream(&nats.StreamConfig{ 1234 Name: "TEST", 1235 Subjects: []string{"foo"}, 1236 Retention: nats.InterestPolicy, 1237 Replicas: 3, 1238 }) 1239 require_NoError(t, err) 1240 1241 var subs []*nats.Subscription 1242 ns := 5 1243 1244 for i := 0; i < ns; i++ { 1245 dn := fmt.Sprintf("d%d", i) 1246 sub, err := js.PullSubscribe("foo", dn) 1247 require_NoError(t, err) 1248 subs = append(subs, sub) 1249 } 1250 1251 // Send 10 msgs 1252 n := 10 1253 for i := 0; i < n; i++ { 1254 sendStreamMsg(t, nc, "foo", "msg") 1255 } 1256 1257 // Collect all the messages. 
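// With interest retention, a message is only removed once every durable that
// covers it has acked it; all five pull consumers therefore receive all ten
// messages, and only after the shuffled AckSync pass below should the stream
// drain to zero.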
1258 var msgs []*nats.Msg 1259 for _, sub := range subs { 1260 lmsgs := fetchMsgs(t, sub, n, time.Second) 1261 if len(lmsgs) != n { 1262 t.Fatalf("Did not receive all msgs: %d vs %d", len(lmsgs), n) 1263 } 1264 msgs = append(msgs, lmsgs...) 1265 } 1266 1267 // Shuffle 1268 rand.Shuffle(len(msgs), func(i, j int) { msgs[i], msgs[j] = msgs[j], msgs[i] }) 1269 for _, m := range msgs { 1270 m.AckSync() 1271 } 1272 // Make sure replicated acks are processed. 1273 time.Sleep(250 * time.Millisecond) 1274 1275 si, err := js.StreamInfo("TEST") 1276 require_NoError(t, err) 1277 1278 if si.State.Msgs != 0 { 1279 t.Fatalf("Should not have any messages left: %d of %d", si.State.Msgs, n) 1280 } 1281 } 1282 1283 func TestJetStreamClusterNoPanicOnStreamInfoWhenNoLeaderYet(t *testing.T) { 1284 c := createJetStreamClusterExplicit(t, "R3S", 3) 1285 defer c.shutdown() 1286 1287 nc := natsConnect(t, c.randomServer().ClientURL()) 1288 defer nc.Close() 1289 1290 js, _ := nc.JetStream(nats.MaxWait(500 * time.Millisecond)) 1291 1292 wg := sync.WaitGroup{} 1293 wg.Add(1) 1294 ch := make(chan struct{}) 1295 go func() { 1296 defer wg.Done() 1297 1298 for { 1299 js.StreamInfo("TEST") 1300 select { 1301 case <-ch: 1302 return 1303 case <-time.After(15 * time.Millisecond): 1304 } 1305 } 1306 }() 1307 1308 time.Sleep(250 * time.Millisecond) 1309 1310 // Don't care if this succeeds or not (could get a context deadline 1311 // due to the low MaxWait() when creating the context). 1312 js.AddStream(&nats.StreamConfig{ 1313 Name: "TEST", 1314 Subjects: []string{"foo"}, 1315 Replicas: 3, 1316 }) 1317 1318 close(ch) 1319 wg.Wait() 1320 } 1321 1322 // Issue https://github.com/nats-io/nats-server/issues/3630 1323 func TestJetStreamClusterPullConsumerAcksExtendInactivityThreshold(t *testing.T) { 1324 c := createJetStreamClusterExplicit(t, "R3S", 3) 1325 defer c.shutdown() 1326 1327 nc, js := jsClientConnect(t, c.randomServer()) 1328 defer nc.Close() 1329 1330 js.AddStream(&nats.StreamConfig{ 1331 Name: "TEST", 1332 Subjects: []string{"foo"}, 1333 Replicas: 3, 1334 }) 1335 1336 n := 10 1337 for i := 0; i < n; i++ { 1338 sendStreamMsg(t, nc, "foo", "msg") 1339 } 1340 1341 // Pull Consumer 1342 sub, err := js.PullSubscribe("foo", "d", nats.InactiveThreshold(time.Second)) 1343 require_NoError(t, err) 1344 1345 fetchMsgs(t, sub, n/2, time.Second) 1346 // Will wait for .5s. 1347 time.Sleep(500 * time.Millisecond) 1348 msgs := fetchMsgs(t, sub, n/2, time.Second) 1349 if len(msgs) != n/2 { 1350 t.Fatalf("Did not receive msgs: %d vs %d", len(msgs), n/2) 1351 } 1352 1353 // Wait for .5s. 1354 time.Sleep(500 * time.Millisecond) 1355 msgs[0].Ack() // Ack 1356 // Wait another .5s. 1357 time.Sleep(500 * time.Millisecond) 1358 msgs[1].Nak() // Nak 1359 // Wait another .5s. 1360 time.Sleep(500 * time.Millisecond) 1361 msgs[2].Term() // Term 1362 time.Sleep(500 * time.Millisecond) 1363 msgs[3].InProgress() // WIP 1364 1365 // The above should have kept the consumer alive. 1366 _, err = js.ConsumerInfo("TEST", "d") 1367 require_NoError(t, err) 1368 1369 // Make sure it gets cleaned up. 
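// Ack, Nak, Term and InProgress above all count as activity on the consumer
// and keep pushing the 1s InactiveThreshold out. Once we stop touching it for
// a full 2s, the server should reap it and ConsumerInfo should come back with
// nats.ErrConsumerNotFound.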
1370 time.Sleep(2 * time.Second) 1371 _, err = js.ConsumerInfo("TEST", "d") 1372 require_Error(t, err, nats.ErrConsumerNotFound) 1373 } 1374 1375 // https://github.com/nats-io/nats-server/issues/3677 1376 func TestJetStreamClusterParallelStreamCreation(t *testing.T) { 1377 c := createJetStreamClusterExplicit(t, "R3S", 3) 1378 defer c.shutdown() 1379 1380 np := 100 1381 1382 startCh := make(chan bool) 1383 errCh := make(chan error, np) 1384 1385 wg := sync.WaitGroup{} 1386 wg.Add(np) 1387 1388 start := sync.WaitGroup{} 1389 start.Add(np) 1390 1391 for i := 0; i < np; i++ { 1392 go func() { 1393 defer wg.Done() 1394 1395 // Individual connection 1396 nc, js := jsClientConnect(t, c.randomServer()) 1397 defer nc.Close() 1398 // Signal we are ready 1399 start.Done() 1400 // Make them all fire at once. 1401 <-startCh 1402 1403 if _, err := js.AddStream(&nats.StreamConfig{ 1404 Name: "TEST", 1405 Subjects: []string{"common.*.*"}, 1406 Replicas: 3, 1407 }); err != nil { 1408 errCh <- err 1409 } 1410 }() 1411 } 1412 1413 start.Wait() 1414 close(startCh) 1415 wg.Wait() 1416 1417 if len(errCh) > 0 { 1418 t.Fatalf("Expected no errors, got %d", len(errCh)) 1419 } 1420 } 1421 1422 // In addition to test above, if streams were attempted to be created in parallel 1423 // it could be that multiple raft groups would be created for the same asset. 1424 func TestJetStreamClusterParallelStreamCreationDupeRaftGroups(t *testing.T) { 1425 c := createJetStreamClusterExplicit(t, "R3S", 3) 1426 defer c.shutdown() 1427 1428 np := 20 1429 1430 startCh := make(chan bool) 1431 wg := sync.WaitGroup{} 1432 wg.Add(np) 1433 for i := 0; i < np; i++ { 1434 go func() { 1435 defer wg.Done() 1436 1437 // Individual connection 1438 nc, _ := jsClientConnect(t, c.randomServer()) 1439 js, _ := nc.JetStream(nats.MaxWait(time.Second)) 1440 defer nc.Close() 1441 1442 // Make them all fire at once. 1443 <-startCh 1444 1445 // Ignore errors in this test, care about raft group and metastate. 1446 js.AddStream(&nats.StreamConfig{ 1447 Name: "TEST", 1448 Subjects: []string{"common.*.*"}, 1449 Replicas: 3, 1450 }) 1451 }() 1452 } 1453 1454 close(startCh) 1455 wg.Wait() 1456 1457 // Restart a server too. 1458 s := c.randomServer() 1459 s.Shutdown() 1460 s = c.restartServer(s) 1461 c.waitOnLeader() 1462 c.waitOnStreamLeader(globalAccountName, "TEST") 1463 // Check that this server has only two active raft nodes after restart. 1464 if nrn := s.numRaftNodes(); nrn != 2 { 1465 t.Fatalf("Expected only two active raft nodes, got %d", nrn) 1466 } 1467 1468 // Make sure we only have 2 unique raft groups for all servers. 1469 // One for meta, one for stream. 
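// If the parallel creates had raced into distinct raft group assignments for
// the same stream, the walk below would find more than two group names across
// the cluster (and the restarted server above would have reported extra active
// raft nodes).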
1470 expected := 2 1471 rg := make(map[string]struct{}) 1472 for _, s := range c.servers { 1473 s.rnMu.RLock() 1474 for _, ni := range s.raftNodes { 1475 n := ni.(*raft) 1476 rg[n.Group()] = struct{}{} 1477 } 1478 s.rnMu.RUnlock() 1479 } 1480 if len(rg) != expected { 1481 t.Fatalf("Expected only %d distinct raft groups for all servers, go %d", expected, len(rg)) 1482 } 1483 } 1484 1485 func TestJetStreamClusterParallelConsumerCreation(t *testing.T) { 1486 c := createJetStreamClusterExplicit(t, "R3S", 3) 1487 defer c.shutdown() 1488 1489 nc, js := jsClientConnect(t, c.randomServer()) 1490 defer nc.Close() 1491 1492 _, err := js.AddStream(&nats.StreamConfig{ 1493 Name: "TEST", 1494 Subjects: []string{"common.*.*"}, 1495 Replicas: 3, 1496 }) 1497 require_NoError(t, err) 1498 c.waitOnStreamLeader(globalAccountName, "TEST") 1499 1500 np := 50 1501 1502 startCh := make(chan bool) 1503 errCh := make(chan error, np) 1504 1505 cfg := &nats.ConsumerConfig{ 1506 Durable: "dlc", 1507 Replicas: 3, 1508 } 1509 1510 wg := sync.WaitGroup{} 1511 swg := sync.WaitGroup{} 1512 wg.Add(np) 1513 swg.Add(np) 1514 1515 for i := 0; i < np; i++ { 1516 go func() { 1517 defer wg.Done() 1518 1519 // Individual connection 1520 nc, js := jsClientConnect(t, c.randomServer()) 1521 defer nc.Close() 1522 1523 swg.Done() 1524 1525 // Make them all fire at once. 1526 <-startCh 1527 1528 if _, err := js.AddConsumer("TEST", cfg); err != nil { 1529 errCh <- err 1530 } 1531 }() 1532 } 1533 1534 swg.Wait() 1535 close(startCh) 1536 1537 wg.Wait() 1538 1539 if len(errCh) > 0 { 1540 t.Fatalf("Expected no errors, got %d", len(errCh)) 1541 } 1542 1543 // Make sure we only have 3 unique raft groups for all servers. 1544 // One for meta, one for stream, one for consumer. 1545 expected := 3 1546 rg := make(map[string]struct{}) 1547 for _, s := range c.servers { 1548 s.rnMu.RLock() 1549 for _, ni := range s.raftNodes { 1550 n := ni.(*raft) 1551 rg[n.Group()] = struct{}{} 1552 } 1553 s.rnMu.RUnlock() 1554 } 1555 if len(rg) != expected { 1556 t.Fatalf("Expected only %d distinct raft groups for all servers, go %d", expected, len(rg)) 1557 } 1558 } 1559 1560 func TestJetStreamClusterGhostEphemeralsAfterRestart(t *testing.T) { 1561 c := createJetStreamClusterExplicit(t, "R3S", 3) 1562 defer c.shutdown() 1563 1564 nc, js := jsClientConnect(t, c.randomServer()) 1565 defer nc.Close() 1566 1567 _, err := js.AddStream(&nats.StreamConfig{ 1568 Name: "TEST", 1569 Subjects: []string{"foo"}, 1570 Replicas: 3, 1571 }) 1572 require_NoError(t, err) 1573 1574 // Add in 100 memory based ephemerals. 1575 for i := 0; i < 100; i++ { 1576 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 1577 Replicas: 1, 1578 InactiveThreshold: time.Second, 1579 MemoryStorage: true, 1580 }) 1581 require_NoError(t, err) 1582 } 1583 1584 // Grab random server. 1585 rs := c.randomServer() 1586 // Now shutdown cluster. 1587 c.stopAll() 1588 1589 // Let the consumers all expire. 1590 time.Sleep(2 * time.Second) 1591 1592 // Restart first and wait so that we know it will try cleanup without a metaleader. 
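// "Ghost" ephemerals are consumer assets whose inactivity expiry fired while
// the cluster was down; after the full restart they should be neither listed
// nor reported as missing. The check below polls the consumer list API until
// it comes back empty, roughly:
//
//	m, _ := nc.Request(fmt.Sprintf(JSApiConsumerListT, "TEST"), nil, time.Second)
//	var resp JSApiConsumerListResponse
//	_ = json.Unmarshal(m.Data, &resp)
//	// want len(resp.Consumers) == 0 && len(resp.Missing) == 0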
1593 c.restartServer(rs) 1594 time.Sleep(time.Second) 1595 1596 c.restartAll() 1597 c.waitOnLeader() 1598 c.waitOnStreamLeader(globalAccountName, "TEST") 1599 1600 nc, _ = jsClientConnect(t, c.randomServer()) 1601 defer nc.Close() 1602 1603 subj := fmt.Sprintf(JSApiConsumerListT, "TEST") 1604 checkFor(t, 10*time.Second, 200*time.Millisecond, func() error { 1605 m, err := nc.Request(subj, nil, time.Second) 1606 if err != nil { 1607 return err 1608 } 1609 var resp JSApiConsumerListResponse 1610 err = json.Unmarshal(m.Data, &resp) 1611 require_NoError(t, err) 1612 if len(resp.Consumers) != 0 { 1613 return fmt.Errorf("Still have %d consumers", len(resp.Consumers)) 1614 } 1615 if len(resp.Missing) != 0 { 1616 return fmt.Errorf("Still have %d missing consumers", len(resp.Missing)) 1617 } 1618 1619 return nil 1620 }) 1621 } 1622 1623 func TestJetStreamClusterReplacementPolicyAfterPeerRemove(t *testing.T) { 1624 // R3 scenario where there is a redundant node in each unique cloud so removing a peer should result in 1625 // an immediate replacement also preserving cloud uniqueness. 1626 1627 sc := createJetStreamClusterExplicit(t, "PR9", 9) 1628 sc.waitOnPeerCount(9) 1629 1630 reset := func(s *Server) { 1631 s.mu.Lock() 1632 rch := s.sys.resetCh 1633 s.mu.Unlock() 1634 if rch != nil { 1635 rch <- struct{}{} 1636 } 1637 s.sendStatszUpdate() 1638 } 1639 1640 tags := []string{"cloud:aws", "cloud:aws", "cloud:aws", "cloud:gcp", "cloud:gcp", "cloud:gcp", "cloud:az", "cloud:az", "cloud:az"} 1641 1642 var serverUTags = make(map[string]string) 1643 1644 for i, s := range sc.servers { 1645 s.optsMu.Lock() 1646 serverUTags[s.Name()] = tags[i] 1647 s.opts.Tags.Add(tags[i]) 1648 s.opts.JetStreamUniqueTag = "cloud" 1649 s.optsMu.Unlock() 1650 reset(s) 1651 } 1652 1653 ml := sc.leader() 1654 js := ml.getJetStream() 1655 require_True(t, js != nil) 1656 js.mu.RLock() 1657 cc := js.cluster 1658 require_True(t, cc != nil) 1659 1660 // Walk and make sure all tags are registered. 
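// With JetStreamUniqueTag set to "cloud", placement must never put two members
// of the same raft group on servers that share a cloud tag. The statsz resets
// above push the updated tags out to the peers' nodeInfo; this loop just waits
// until every peer reports its tags before any asset is created, so the
// placement decisions below are made with complete information.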
1661 expires := time.Now().Add(10 * time.Second) 1662 for time.Now().Before(expires) { 1663 allOK := true 1664 for _, p := range cc.meta.Peers() { 1665 si, ok := ml.nodeToInfo.Load(p.ID) 1666 require_True(t, ok) 1667 ni := si.(nodeInfo) 1668 if len(ni.tags) == 0 { 1669 allOK = false 1670 reset(sc.serverByName(ni.name)) 1671 } 1672 } 1673 if allOK { 1674 break 1675 } 1676 } 1677 js.mu.RUnlock() 1678 defer sc.shutdown() 1679 1680 sc.waitOnClusterReadyWithNumPeers(9) 1681 1682 s := sc.leader() 1683 nc, jsc := jsClientConnect(t, s) 1684 defer nc.Close() 1685 1686 _, err := jsc.AddStream(&nats.StreamConfig{ 1687 Name: "TEST", 1688 Subjects: []string{"foo"}, 1689 Replicas: 3, 1690 }) 1691 require_NoError(t, err) 1692 1693 sc.waitOnStreamLeader(globalAccountName, "TEST") 1694 1695 osi, err := jsc.StreamInfo("TEST") 1696 require_NoError(t, err) 1697 1698 // Double check original placement honors unique_tag 1699 var uTags = make(map[string]struct{}) 1700 1701 uTags[serverUTags[osi.Cluster.Leader]] = struct{}{} 1702 for _, replica := range osi.Cluster.Replicas { 1703 evalTag := serverUTags[replica.Name] 1704 if _, exists := uTags[evalTag]; !exists { 1705 uTags[evalTag] = struct{}{} 1706 continue 1707 } else { 1708 t.Fatalf("expected initial placement to honor unique_tag") 1709 } 1710 } 1711 1712 // Remove a peer and select replacement 5 times to avoid false good 1713 for i := 0; i < 5; i++ { 1714 // Remove 1 peer replica (this will be random cloud region as initial placement was randomized ordering) 1715 // After each successful iteration, osi will reflect the current RG peers 1716 toRemove := osi.Cluster.Replicas[0].Name 1717 resp, err := nc.Request(fmt.Sprintf(JSApiStreamRemovePeerT, "TEST"), []byte(`{"peer":"`+toRemove+`"}`), time.Second) 1718 require_NoError(t, err) 1719 var rpResp JSApiStreamRemovePeerResponse 1720 err = json.Unmarshal(resp.Data, &rpResp) 1721 require_NoError(t, err) 1722 require_True(t, rpResp.Success) 1723 1724 sc.waitOnStreamLeader(globalAccountName, "TEST") 1725 1726 checkFor(t, time.Second, 200*time.Millisecond, func() error { 1727 osi, err = jsc.StreamInfo("TEST") 1728 require_NoError(t, err) 1729 if len(osi.Cluster.Replicas) != 2 { 1730 return fmt.Errorf("expected R3, got R%d", len(osi.Cluster.Replicas)+1) 1731 } 1732 // STREAM.PEER.REMOVE is asynchronous command; make sure remove has occurred by 1733 // checking that the toRemove peer is gone. 
1734 for _, replica := range osi.Cluster.Replicas { 1735 if replica.Name == toRemove { 1736 return fmt.Errorf("expected replaced replica, old replica still present") 1737 } 1738 } 1739 return nil 1740 }) 1741 1742 // Validate that replacement with new peer still honors 1743 uTags = make(map[string]struct{}) //reset 1744 1745 uTags[serverUTags[osi.Cluster.Leader]] = struct{}{} 1746 for _, replica := range osi.Cluster.Replicas { 1747 evalTag := serverUTags[replica.Name] 1748 if _, exists := uTags[evalTag]; !exists { 1749 uTags[evalTag] = struct{}{} 1750 continue 1751 } else { 1752 t.Fatalf("expected new peer and revised placement to honor unique_tag") 1753 } 1754 } 1755 } 1756 } 1757 1758 func TestJetStreamClusterReplacementPolicyAfterPeerRemoveNoPlace(t *testing.T) { 1759 // R3 scenario where there are exactly three unique cloud nodes, so removing a peer should NOT 1760 // result in a new peer 1761 1762 sc := createJetStreamClusterExplicit(t, "threeup", 3) 1763 sc.waitOnPeerCount(3) 1764 1765 reset := func(s *Server) { 1766 s.mu.Lock() 1767 rch := s.sys.resetCh 1768 s.mu.Unlock() 1769 if rch != nil { 1770 rch <- struct{}{} 1771 } 1772 s.sendStatszUpdate() 1773 } 1774 1775 tags := []string{"cloud:aws", "cloud:gcp", "cloud:az"} 1776 1777 var serverUTags = make(map[string]string) 1778 1779 for i, s := range sc.servers { 1780 s.optsMu.Lock() 1781 serverUTags[s.Name()] = tags[i] 1782 s.opts.Tags.Add(tags[i]) 1783 s.opts.JetStreamUniqueTag = "cloud" 1784 s.optsMu.Unlock() 1785 reset(s) 1786 } 1787 1788 ml := sc.leader() 1789 js := ml.getJetStream() 1790 require_True(t, js != nil) 1791 js.mu.RLock() 1792 cc := js.cluster 1793 require_True(t, cc != nil) 1794 1795 // Walk and make sure all tags are registered. 1796 expires := time.Now().Add(10 * time.Second) 1797 for time.Now().Before(expires) { 1798 allOK := true 1799 for _, p := range cc.meta.Peers() { 1800 si, ok := ml.nodeToInfo.Load(p.ID) 1801 require_True(t, ok) 1802 ni := si.(nodeInfo) 1803 if len(ni.tags) == 0 { 1804 allOK = false 1805 reset(sc.serverByName(ni.name)) 1806 } 1807 } 1808 if allOK { 1809 break 1810 } 1811 } 1812 js.mu.RUnlock() 1813 defer sc.shutdown() 1814 1815 sc.waitOnClusterReadyWithNumPeers(3) 1816 1817 s := sc.leader() 1818 nc, jsc := jsClientConnect(t, s) 1819 defer nc.Close() 1820 1821 _, err := jsc.AddStream(&nats.StreamConfig{ 1822 Name: "TEST", 1823 Subjects: []string{"foo"}, 1824 Replicas: 3, 1825 }) 1826 require_NoError(t, err) 1827 1828 sc.waitOnStreamLeader(globalAccountName, "TEST") 1829 1830 osi, err := jsc.StreamInfo("TEST") 1831 require_NoError(t, err) 1832 1833 // Double check original placement honors unique_tag 1834 var uTags = make(map[string]struct{}) 1835 1836 uTags[serverUTags[osi.Cluster.Leader]] = struct{}{} 1837 for _, replica := range osi.Cluster.Replicas { 1838 evalTag := serverUTags[replica.Name] 1839 if _, exists := uTags[evalTag]; !exists { 1840 uTags[evalTag] = struct{}{} 1841 continue 1842 } else { 1843 t.Fatalf("expected initial placement to honor unique_tag") 1844 } 1845 } 1846 1847 // Remove 1 peer replica (this will be random cloud region as initial placement was randomized ordering) 1848 _, err = nc.Request("$JS.API.STREAM.PEER.REMOVE.TEST", []byte(`{"peer":"`+osi.Cluster.Replicas[0].Name+`"}`), time.Second*10) 1849 require_NoError(t, err) 1850 1851 sc.waitOnStreamLeader(globalAccountName, "TEST") 1852 1853 // Verify R2 since no eligible peer can replace the removed peer without braking unique constraint 1854 checkFor(t, time.Second, 200*time.Millisecond, func() error { 1855 osi, err = 
jsc.StreamInfo("TEST") 1856 require_NoError(t, err) 1857 if len(osi.Cluster.Replicas) != 1 { 1858 return fmt.Errorf("expected R2, got R%d", len(osi.Cluster.Replicas)+1) 1859 } 1860 return nil 1861 }) 1862 1863 // Validate that remaining members still honor unique tags 1864 uTags = make(map[string]struct{}) //reset 1865 1866 uTags[serverUTags[osi.Cluster.Leader]] = struct{}{} 1867 for _, replica := range osi.Cluster.Replicas { 1868 evalTag := serverUTags[replica.Name] 1869 if _, exists := uTags[evalTag]; !exists { 1870 uTags[evalTag] = struct{}{} 1871 continue 1872 } else { 1873 t.Fatalf("expected revised placement to honor unique_tag") 1874 } 1875 } 1876 } 1877 1878 // https://github.com/nats-io/nats-server/issues/3191 1879 func TestJetStreamClusterLeafnodeDuplicateConsumerMessages(t *testing.T) { 1880 // Cluster B 1881 c := createJetStreamCluster(t, jsClusterTempl, "B", _EMPTY_, 2, 22020, false) 1882 defer c.shutdown() 1883 1884 // Cluster A 1885 // Domain is "A' 1886 lc := c.createLeafNodesWithStartPortAndDomain("A", 2, 22110, "A") 1887 defer lc.shutdown() 1888 1889 lc.waitOnClusterReady() 1890 1891 // We want A-S-1 connected to B-S-1 and A-S-2 connected to B-S-2 1892 // So adjust if needed. 1893 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 1894 for i, ls := range lc.servers { 1895 ls.mu.RLock() 1896 var remoteServer string 1897 for _, rc := range ls.leafs { 1898 rc.mu.Lock() 1899 remoteServer = rc.leaf.remoteServer 1900 rc.mu.Unlock() 1901 break 1902 } 1903 ls.mu.RUnlock() 1904 1905 wantedRemote := fmt.Sprintf("S-%d", i+1) 1906 if remoteServer != wantedRemote { 1907 ls.Shutdown() 1908 lc.restartServer(ls) 1909 return fmt.Errorf("Leafnode server %d not connected to %q", i+1, wantedRemote) 1910 } 1911 } 1912 return nil 1913 }) 1914 1915 // Wait on ready again. 1916 lc.waitOnClusterReady() 1917 1918 // Create a stream and a durable pull consumer on cluster A. 1919 lnc, ljs := jsClientConnect(t, lc.randomServer()) 1920 defer lnc.Close() 1921 1922 _, err := ljs.AddStream(&nats.StreamConfig{ 1923 Name: "TEST", 1924 Subjects: []string{"foo"}, 1925 Replicas: 2, 1926 }) 1927 require_NoError(t, err) 1928 1929 // Make sure stream leader is on S-1 1930 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 1931 si, err := ljs.StreamInfo("TEST") 1932 require_NoError(t, err) 1933 if si.Cluster.Leader == "A-S-1" { 1934 return nil 1935 } 1936 _, err = lnc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, time.Second) 1937 require_NoError(t, err) 1938 return fmt.Errorf("Stream leader not placed on A-S-1") 1939 }) 1940 1941 _, err = ljs.StreamInfo("TEST") 1942 require_NoError(t, err) 1943 1944 _, err = ljs.AddConsumer("TEST", &nats.ConsumerConfig{ 1945 Durable: "dlc", 1946 Replicas: 2, 1947 MaxDeliver: 1, 1948 AckPolicy: nats.AckNonePolicy, 1949 }) 1950 require_NoError(t, err) 1951 1952 // Make sure consumer leader is on S-2 1953 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 1954 ci, err := ljs.ConsumerInfo("TEST", "dlc") 1955 require_NoError(t, err) 1956 if ci.Cluster.Leader == "A-S-2" { 1957 return nil 1958 } 1959 _, err = lnc.Request(fmt.Sprintf(JSApiConsumerLeaderStepDownT, "TEST", "dlc"), nil, time.Second) 1960 require_NoError(t, err) 1961 return fmt.Errorf("Stream leader not placed on A-S-1") 1962 }) 1963 1964 _, err = ljs.ConsumerInfo("TEST", "dlc") 1965 require_NoError(t, err) 1966 1967 // Send 2 messages. 
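// Two messages go in first; the apps below then connect to cluster B and bind
// to the consumer living in leafnode domain "A" via nc.JetStream(nats.Domain("A")),
// which routes the pull requests through the $JS.A.API prefix over the
// leafnode links. The property under test (issue #3191) is that, even after a
// leafnode reconnects to a different hub server, each message is delivered to
// only one of the two pull subscribers.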
1968 sendStreamMsg(t, lnc, "foo", "M-1") 1969 sendStreamMsg(t, lnc, "foo", "M-2") 1970 1971 // Now bind apps to cluster B servers and bind to pull consumer. 1972 nc1, _ := jsClientConnect(t, c.servers[0]) 1973 defer nc1.Close() 1974 js1, err := nc1.JetStream(nats.Domain("A")) 1975 require_NoError(t, err) 1976 1977 sub1, err := js1.PullSubscribe("foo", "dlc", nats.BindStream("TEST")) 1978 require_NoError(t, err) 1979 defer sub1.Unsubscribe() 1980 1981 nc2, _ := jsClientConnect(t, c.servers[1]) 1982 defer nc2.Close() 1983 js2, err := nc2.JetStream(nats.Domain("A")) 1984 require_NoError(t, err) 1985 1986 sub2, err := js2.PullSubscribe("foo", "dlc", nats.BindStream("TEST")) 1987 require_NoError(t, err) 1988 defer sub2.Unsubscribe() 1989 1990 // Make sure we can properly get messages. 1991 msgs, err := sub1.Fetch(1) 1992 require_NoError(t, err) 1993 require_True(t, len(msgs) == 1) 1994 require_True(t, string(msgs[0].Data) == "M-1") 1995 1996 msgs, err = sub2.Fetch(1) 1997 require_NoError(t, err) 1998 require_True(t, len(msgs) == 1) 1999 require_True(t, string(msgs[0].Data) == "M-2") 2000 2001 // Make sure delivered state makes it to other server to not accidentally send M-2 again 2002 // and fail the test below. 2003 time.Sleep(250 * time.Millisecond) 2004 2005 // Now let's introduce and event, where A-S-2 will now reconnect after a restart to B-S-2 2006 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 2007 ls := lc.servers[1] 2008 wantedRemote := "S-1" 2009 var remoteServer string 2010 2011 ls.mu.RLock() 2012 for _, rc := range ls.leafs { 2013 rc.mu.Lock() 2014 remoteServer = rc.leaf.remoteServer 2015 rc.mu.Unlock() 2016 break 2017 } 2018 ls.mu.RUnlock() 2019 2020 if remoteServer != wantedRemote { 2021 ls.Shutdown() 2022 lc.restartServer(ls) 2023 return fmt.Errorf("Leafnode server not connected to %q", wantedRemote) 2024 } 2025 return nil 2026 }) 2027 2028 // Wait on ready again. 2029 lc.waitOnClusterReady() 2030 lc.waitOnStreamLeader(globalAccountName, "TEST") 2031 lc.waitOnConsumerLeader(globalAccountName, "TEST", "dlc") 2032 2033 // Send 2 more messages. 2034 sendStreamMsg(t, lnc, "foo", "M-3") 2035 sendStreamMsg(t, lnc, "foo", "M-4") 2036 2037 msgs, err = sub1.Fetch(2) 2038 require_NoError(t, err) 2039 require_True(t, len(msgs) == 2) 2040 require_True(t, string(msgs[0].Data) == "M-3") 2041 require_True(t, string(msgs[1].Data) == "M-4") 2042 2043 // Send 2 more messages. 2044 sendStreamMsg(t, lnc, "foo", "M-5") 2045 sendStreamMsg(t, lnc, "foo", "M-6") 2046 2047 msgs, err = sub2.Fetch(2) 2048 require_NoError(t, err) 2049 require_True(t, len(msgs) == 2) 2050 require_True(t, string(msgs[0].Data) == "M-5") 2051 require_True(t, string(msgs[1].Data) == "M-6") 2052 } 2053 2054 func snapRGSet(pFlag bool, banner string, osi *nats.StreamInfo) *map[string]struct{} { 2055 var snapSet = make(map[string]struct{}) 2056 if pFlag { 2057 fmt.Println(banner) 2058 } 2059 if osi == nil { 2060 if pFlag { 2061 fmt.Printf("bonkers!\n") 2062 } 2063 return nil 2064 } 2065 2066 snapSet[osi.Cluster.Leader] = struct{}{} 2067 if pFlag { 2068 fmt.Printf("Leader: %s\n", osi.Cluster.Leader) 2069 } 2070 for _, replica := range osi.Cluster.Replicas { 2071 snapSet[replica.Name] = struct{}{} 2072 if pFlag { 2073 fmt.Printf("Replica: %s\n", replica.Name) 2074 } 2075 } 2076 2077 return &snapSet 2078 } 2079 2080 func TestJetStreamClusterAfterPeerRemoveZeroState(t *testing.T) { 2081 // R3 scenario (w/messages) in a 4-node cluster. Peer remove from RG and add back to same RG later. 
2082 // Validate that original peer brought no memory or issues from its previous RG tour of duty, specifically 2083 // that the restored peer has the correct filestore usage bytes for the asset. 2084 var err error 2085 2086 sc := createJetStreamClusterExplicit(t, "cl4", 4) 2087 defer sc.shutdown() 2088 2089 sc.waitOnClusterReadyWithNumPeers(4) 2090 2091 s := sc.leader() 2092 nc, jsc := jsClientConnect(t, s) 2093 defer nc.Close() 2094 2095 _, err = jsc.AddStream(&nats.StreamConfig{ 2096 Name: "foo", 2097 Subjects: []string{"foo.*"}, 2098 Replicas: 3, 2099 }) 2100 require_NoError(t, err) 2101 2102 sc.waitOnStreamLeader(globalAccountName, "foo") 2103 2104 osi, err := jsc.StreamInfo("foo") 2105 require_NoError(t, err) 2106 2107 // make sure 0 msgs 2108 require_True(t, osi.State.Msgs == 0) 2109 2110 // load up messages 2111 toSend := 10000 2112 // storage bytes with JS message overhead 2113 assetStoreBytesExpected := uint64(460000) 2114 2115 for i := 1; i <= toSend; i++ { 2116 msg := []byte("Hello World") 2117 if _, err = jsc.Publish("foo.a", msg); err != nil { 2118 t.Fatalf("unexpected publish error: %v", err) 2119 } 2120 } 2121 2122 osi, err = jsc.StreamInfo("foo") 2123 require_NoError(t, err) 2124 2125 // make sure 10000 msgs 2126 require_True(t, osi.State.Msgs == uint64(toSend)) 2127 2128 origSet := *snapRGSet(false, "== Orig RG Set ==", osi) 2129 2130 // remove 1 peer replica (1 of 2 non-leaders) 2131 origPeer := osi.Cluster.Replicas[0].Name 2132 resp, err := nc.Request(fmt.Sprintf(JSApiStreamRemovePeerT, "foo"), []byte(`{"peer":"`+origPeer+`"}`), time.Second) 2133 require_NoError(t, err) 2134 var rpResp JSApiStreamRemovePeerResponse 2135 err = json.Unmarshal(resp.Data, &rpResp) 2136 require_NoError(t, err) 2137 require_True(t, rpResp.Success) 2138 2139 // validate the origPeer is removed with a replacement newPeer 2140 sc.waitOnStreamLeader(globalAccountName, "foo") 2141 checkFor(t, time.Second, 200*time.Millisecond, func() error { 2142 osi, err = jsc.StreamInfo("foo") 2143 require_NoError(t, err) 2144 if len(osi.Cluster.Replicas) != 2 { 2145 return fmt.Errorf("expected R3, got R%d", len(osi.Cluster.Replicas)+1) 2146 } 2147 // STREAM.PEER.REMOVE is asynchronous command; make sure remove has occurred 2148 for _, replica := range osi.Cluster.Replicas { 2149 if replica.Name == origPeer { 2150 return fmt.Errorf("expected replaced replica, old replica still present") 2151 } 2152 } 2153 return nil 2154 }) 2155 2156 // identify the new peer 2157 var newPeer string 2158 osi, err = jsc.StreamInfo("foo") 2159 require_NoError(t, err) 2160 newSet := *snapRGSet(false, "== New RG Set ==", osi) 2161 for peer := range newSet { 2162 _, ok := origSet[peer] 2163 if !ok { 2164 newPeer = peer 2165 break 2166 } 2167 } 2168 require_True(t, newPeer != "") 2169 2170 // kick out newPeer which will cause origPeer to be assigned to the RG again 2171 resp, err = nc.Request(fmt.Sprintf(JSApiStreamRemovePeerT, "foo"), []byte(`{"peer":"`+newPeer+`"}`), time.Second) 2172 require_NoError(t, err) 2173 err = json.Unmarshal(resp.Data, &rpResp) 2174 require_NoError(t, err) 2175 require_True(t, rpResp.Success) 2176 2177 // validate the newPeer is removed and R3 has reformed (with origPeer) 2178 sc.waitOnStreamLeader(globalAccountName, "foo") 2179 checkFor(t, time.Second, 200*time.Millisecond, func() error { 2180 osi, err = jsc.StreamInfo("foo") 2181 require_NoError(t, err) 2182 if len(osi.Cluster.Replicas) != 2 { 2183 return fmt.Errorf("expected R3, got R%d", len(osi.Cluster.Replicas)+1) 2184 } 2185 // STREAM.PEER.REMOVE is 
an asynchronous command; make sure the remove has occurred 2186 for _, replica := range osi.Cluster.Replicas { 2187 if replica.Name == newPeer { 2188 return fmt.Errorf("expected replaced replica, old replica still present") 2189 } 2190 } 2191 return nil 2192 }) 2193 2194 osi, err = jsc.StreamInfo("foo") 2195 require_NoError(t, err) 2196 2197 // make sure all msgs reported in stream at this point with original leader 2198 require_True(t, osi.State.Msgs == uint64(toSend)) 2199 2200 snapRGSet(false, "== RG Set w/origPeer Back ==", osi) 2201 2202 // get a handle to original peer server 2203 var origServer *Server = sc.serverByName(origPeer) 2204 if origServer == nil { 2205 t.Fatalf("expected to get a handle to original peer server by name") 2206 } 2207 2208 checkFor(t, time.Second, 200*time.Millisecond, func() error { 2209 jszResult, err := origServer.Jsz(nil) 2210 require_NoError(t, err) 2211 if jszResult.Store != assetStoreBytesExpected { 2212 return fmt.Errorf("expected %d storage on orig peer, got %d", assetStoreBytesExpected, jszResult.Store) 2213 } 2214 return nil 2215 }) 2216 } 2217 2218 func TestJetStreamClusterMemLeaderRestart(t *testing.T) { 2219 // Test that if the leader server of an R3 clustered mem store asset is restarted, the asset remains stable with the final quorum 2220 c := createJetStreamClusterExplicit(t, "R3S", 3) 2221 defer c.shutdown() 2222 2223 ml := c.leader() 2224 nc, jsc := jsClientConnect(t, ml) 2225 defer nc.Close() 2226 2227 _, err := jsc.AddStream(&nats.StreamConfig{ 2228 Name: "foo", 2229 Storage: nats.MemoryStorage, 2230 Subjects: []string{"foo.*"}, 2231 Replicas: 3, 2232 }) 2233 require_NoError(t, err) 2234 2235 // load up messages 2236 toSend := 10000 2237 for i := 1; i <= toSend; i++ { 2238 msg := []byte("Hello World") 2239 if _, err = jsc.Publish("foo.a", msg); err != nil { 2240 t.Fatalf("unexpected publish error: %v", err) 2241 } 2242 } 2243 2244 osi, err := jsc.StreamInfo("foo") 2245 require_NoError(t, err) 2246 // make sure 10000 msgs 2247 require_True(t, osi.State.Msgs == uint64(toSend)) 2248 2249 // Shutdown the stream leader server 2250 rs := c.serverByName(osi.Cluster.Leader) 2251 rs.Shutdown() 2252 2253 // Make sure that we have a META leader (there can always be a re-election) 2254 c.waitOnLeader() 2255 c.waitOnStreamLeader(globalAccountName, "foo") 2256 2257 // Should still have quorum and a new leader 2258 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 2259 osi, err = jsc.StreamInfo("foo") 2260 if err != nil { 2261 return fmt.Errorf("expected healthy stream asset, got %s", err.Error()) 2262 } 2263 if osi.Cluster.Leader == _EMPTY_ { 2264 return fmt.Errorf("expected healthy stream asset with new leader") 2265 } 2266 if osi.State.Msgs != uint64(toSend) { 2267 return fmt.Errorf("expected healthy stream asset %d messages, got %d messages", toSend, osi.State.Msgs) 2268 } 2269 return nil 2270 }) 2271 2272 // Now restart the old leader peer (old stream state) 2273 oldrs := rs 2274 rs, _ = RunServerWithConfig(rs.getOpts().ConfigFile) 2275 defer rs.Shutdown() 2276 2277 // Replace the old server with the new one in our list 2278 for i := 0; i < len(c.servers); i++ { 2279 if c.servers[i] == oldrs { 2280 c.servers[i] = rs 2281 } 2282 } 2283 2284 // Wait for cluster to be formed 2285 checkClusterFormed(t, c.servers...)
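// Note: the stream uses MemoryStorage, so the restarted peer comes back empty
// and has to catch up from the remaining quorum. The checks below only assert
// that the group re-forms with a current leader and still serves stream info.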
2286 2287 // Make sure that we have a leader (there can always be a re-election) 2288 c.waitOnLeader() 2289 2290 // Can we get stream info after the old leader's return? 2291 osi, err = jsc.StreamInfo("foo") 2292 if err != nil { 2293 t.Fatalf("expected stream asset info return, got %s", err.Error()) 2294 } 2295 2296 // When the asset leader came back, did we re-form with quorum? 2297 if osi.Cluster.Leader == "" { 2298 t.Fatalf("expected a current leader after old leader restarted") 2299 } 2300 } 2301 2302 // Customer reported R1 consumers that seemed to be ghosted after server restart. 2303 func TestJetStreamClusterLostConsumers(t *testing.T) { 2304 c := createJetStreamClusterExplicit(t, "GHOST", 3) 2305 defer c.shutdown() 2306 2307 nc, js := jsClientConnect(t, c.randomServer()) 2308 defer nc.Close() 2309 2310 _, err := js.AddStream(&nats.StreamConfig{ 2311 Name: "TEST", 2312 Subjects: []string{"events.>"}, 2313 Replicas: 3, 2314 }) 2315 require_NoError(t, err) 2316 2317 for i := 0; i < 10; i++ { 2318 for j := 0; j < 10; j++ { 2319 _, err := js.Publish(fmt.Sprintf("events.%d.%d", i, j), []byte("test")) 2320 require_NoError(t, err) 2321 } 2322 } 2323 2324 s := c.randomServer() 2325 s.Shutdown() 2326 2327 c.waitOnLeader() 2328 c.waitOnStreamLeader(globalAccountName, "TEST") 2329 2330 nc, _ = jsClientConnect(t, c.randomServer()) 2331 defer nc.Close() 2332 2333 cc := CreateConsumerRequest{ 2334 Stream: "TEST", 2335 Config: ConsumerConfig{ 2336 AckPolicy: AckExplicit, 2337 }, 2338 } 2339 req, err := json.Marshal(cc) 2340 require_NoError(t, err) 2341 2342 reqSubj := fmt.Sprintf(JSApiConsumerCreateT, "TEST") 2343 2344 // Now create 50 consumers. We do not wait for the answer. 2345 for i := 0; i < 50; i++ { 2346 nc.Publish(reqSubj, req) 2347 } 2348 nc.Flush() 2349 2350 // Grab the meta leader. 2351 ml := c.leader() 2352 require_NoError(t, ml.JetStreamSnapshotMeta()) 2353 2354 numConsumerAssignments := func(s *Server) int { 2355 t.Helper() 2356 js := s.getJetStream() 2357 js.mu.RLock() 2358 defer js.mu.RUnlock() 2359 cc := js.cluster 2360 for _, asa := range cc.streams { 2361 for _, sa := range asa { 2362 return len(sa.consumers) 2363 } 2364 } 2365 return 0 2366 } 2367 2368 checkFor(t, time.Second, 100*time.Millisecond, func() error { 2369 num := numConsumerAssignments(ml) 2370 if num == 50 { 2371 return nil 2372 } 2373 return fmt.Errorf("Consumer count is only %d", num) 2374 }) 2375 2376 // Restart the server we shut down. We snapshotted, so the snapshot 2377 // has to fill in the new consumers. 2378 // The bug would fail to add them to the meta state since the stream 2379 // existed.
2380 s = c.restartServer(s) 2381 2382 checkFor(t, time.Second, 100*time.Millisecond, func() error { 2383 num := numConsumerAssignments(s) 2384 if num == 50 { 2385 return nil 2386 } 2387 return fmt.Errorf("Consumer count is only %d", num) 2388 }) 2389 } 2390 2391 // https://github.com/nats-io/nats-server/issues/3636 2392 func TestJetStreamClusterScaleDownDuringServerOffline(t *testing.T) { 2393 c := createJetStreamClusterExplicit(t, "R3S", 3) 2394 defer c.shutdown() 2395 2396 nc, js := jsClientConnect(t, c.randomServer()) 2397 defer nc.Close() 2398 2399 _, err := js.AddStream(&nats.StreamConfig{ 2400 Name: "TEST", 2401 Subjects: []string{"foo"}, 2402 Replicas: 3, 2403 }) 2404 require_NoError(t, err) 2405 2406 for i := 0; i < 100; i++ { 2407 sendStreamMsg(t, nc, "foo", "hello") 2408 } 2409 2410 s := c.randomNonStreamLeader(globalAccountName, "TEST") 2411 s.Shutdown() 2412 2413 c.waitOnLeader() 2414 2415 nc, js = jsClientConnect(t, c.randomServer()) 2416 defer nc.Close() 2417 2418 _, err = js.UpdateStream(&nats.StreamConfig{ 2419 Name: "TEST", 2420 Subjects: []string{"foo"}, 2421 Replicas: 1, 2422 }) 2423 require_NoError(t, err) 2424 2425 s = c.restartServer(s) 2426 checkFor(t, time.Second, 200*time.Millisecond, func() error { 2427 hs := s.healthz(nil) 2428 if hs.Error != _EMPTY_ { 2429 return errors.New(hs.Error) 2430 } 2431 return nil 2432 }) 2433 } 2434 2435 // Reported by a customer manually upgrading their streams to support direct gets. 2436 // Worked if single replica but not in clustered mode. 2437 func TestJetStreamClusterDirectGetStreamUpgrade(t *testing.T) { 2438 c := createJetStreamClusterExplicit(t, "R3S", 3) 2439 defer c.shutdown() 2440 2441 nc, js := jsClientConnect(t, c.randomServer()) 2442 defer nc.Close() 2443 2444 _, err := js.AddStream(&nats.StreamConfig{ 2445 Name: "KV_TEST", 2446 Subjects: []string{"$KV.TEST.>"}, 2447 Discard: nats.DiscardNew, 2448 MaxMsgsPerSubject: 1, 2449 DenyDelete: true, 2450 Replicas: 3, 2451 }) 2452 require_NoError(t, err) 2453 2454 kv, err := js.KeyValue("TEST") 2455 require_NoError(t, err) 2456 2457 _, err = kv.PutString("name", "derek") 2458 require_NoError(t, err) 2459 2460 entry, err := kv.Get("name") 2461 require_NoError(t, err) 2462 require_True(t, string(entry.Value()) == "derek") 2463 2464 // Now simulate an update to the stream to support direct gets. 2465 _, err = js.UpdateStream(&nats.StreamConfig{ 2466 Name: "KV_TEST", 2467 Subjects: []string{"$KV.TEST.>"}, 2468 Discard: nats.DiscardNew, 2469 MaxMsgsPerSubject: 1, 2470 DenyDelete: true, 2471 AllowDirect: true, 2472 Replicas: 3, 2473 }) 2474 require_NoError(t, err) 2475 2476 // Rebind to the KV to make sure we use the DIRECT version of Get(). 2477 kv, err = js.KeyValue("TEST") 2478 require_NoError(t, err) 2479 2480 // Make sure direct get works. 2481 entry, err = kv.Get("name") 2482 require_NoError(t, err) 2483 require_True(t, string(entry.Value()) == "derek") 2484 } 2485 2486 // For interest (or workqueue) based streams it's important to match the replication factor. 2487 // This was the case but now that more control over consumer creation is allowed it's possible 2488 // to create a consumer where the replication factor does not match. This could cause 2489 // instability in the state between servers and cause problems on leader switches.
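//
// As a rough sketch of the rejected shape (mirroring the test that follows), adding an
// R1 consumer to an R3 interest-based stream should fail with a replicas mismatch:
//
//	_, err := js.AddConsumer("TEST", &nats.ConsumerConfig{
//		Durable:   "XX",
//		AckPolicy: nats.AckExplicitPolicy,
//		Replicas:  1, // stream is Replicas: 3, so this is rejected
//	})
//	// err is expected to match NewJSConsumerReplicasShouldMatchStreamError()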
2490 func TestJetStreamClusterInterestPolicyStreamForConsumersToMatchRFactor(t *testing.T) { 2491 c := createJetStreamClusterExplicit(t, "R3S", 3) 2492 defer c.shutdown() 2493 2494 nc, js := jsClientConnect(t, c.randomServer()) 2495 defer nc.Close() 2496 2497 _, err := js.AddStream(&nats.StreamConfig{ 2498 Name: "TEST", 2499 Subjects: []string{"foo"}, 2500 Retention: nats.InterestPolicy, 2501 Replicas: 3, 2502 }) 2503 require_NoError(t, err) 2504 2505 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 2506 Durable: "XX", 2507 AckPolicy: nats.AckExplicitPolicy, 2508 Replicas: 1, 2509 }) 2510 2511 require_Error(t, err, NewJSConsumerReplicasShouldMatchStreamError()) 2512 } 2513 2514 // https://github.com/nats-io/nats-server/issues/3791 2515 func TestJetStreamClusterKVWatchersWithServerDown(t *testing.T) { 2516 c := createJetStreamClusterExplicit(t, "R3S", 3) 2517 defer c.shutdown() 2518 2519 nc, js := jsClientConnect(t, c.randomServer()) 2520 defer nc.Close() 2521 2522 kv, err := js.CreateKeyValue(&nats.KeyValueConfig{ 2523 Bucket: "TEST", 2524 Replicas: 3, 2525 }) 2526 require_NoError(t, err) 2527 2528 kv.PutString("foo", "bar") 2529 kv.PutString("foo", "baz") 2530 2531 // Shutdown a follower. 2532 s := c.randomNonStreamLeader(globalAccountName, "KV_TEST") 2533 s.Shutdown() 2534 c.waitOnLeader() 2535 2536 nc, _ = jsClientConnect(t, c.randomServer()) 2537 defer nc.Close() 2538 2539 js, err = nc.JetStream(nats.MaxWait(2 * time.Second)) 2540 require_NoError(t, err) 2541 2542 kv, err = js.KeyValue("TEST") 2543 require_NoError(t, err) 2544 2545 for i := 0; i < 100; i++ { 2546 w, err := kv.Watch("foo") 2547 require_NoError(t, err) 2548 w.Stop() 2549 } 2550 } 2551 2552 // TestJetStreamClusterCurrentVsHealth is designed to show the 2553 // difference between "current" and "healthy" when async publishes 2554 // outpace the rate at which they can be applied. 2555 func TestJetStreamClusterCurrentVsHealth(t *testing.T) { 2556 c := createJetStreamClusterExplicit(t, "R3S", 3) 2557 defer c.shutdown() 2558 2559 c.waitOnLeader() 2560 server := c.randomNonLeader() 2561 2562 nc, js := jsClientConnect(t, server) 2563 defer nc.Close() 2564 2565 _, err := js.AddStream(&nats.StreamConfig{ 2566 Name: "TEST", 2567 Subjects: []string{"foo"}, 2568 Replicas: 3, 2569 }) 2570 require_NoError(t, err) 2571 2572 server = c.randomNonStreamLeader(globalAccountName, "TEST") 2573 stream, err := server.GlobalAccount().lookupStream("TEST") 2574 require_NoError(t, err) 2575 2576 raft, ok := stream.raftGroup().node.(*raft) 2577 require_True(t, ok) 2578 2579 for i := 0; i < 1000; i++ { 2580 _, err := js.PublishAsync("foo", []byte("bar")) 2581 require_NoError(t, err) 2582 2583 raft.RLock() 2584 commit := raft.commit 2585 applied := raft.applied 2586 raft.RUnlock() 2587 2588 current := raft.Current() 2589 healthy := raft.Healthy() 2590 2591 if !current || !healthy || commit != applied { 2592 t.Logf( 2593 "%d | Current %v, healthy %v, commit %d, applied %d, pending %d", 2594 i, current, healthy, commit, applied, commit-applied, 2595 ) 2596 } 2597 } 2598 } 2599 2600 // Several users and customers use this setup, but many times across leafnodes. 2601 // This should be allowed in same account since we are really protecting against 2602 // multiple pub acks with cycle detection. 
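//
// A minimal sketch of the active/active shape exercised below: two streams in the same
// account sourcing each other, each with a filter that only matches the other's subjects.
//
//	A: Subjects ["A.>"], Sources [{Name: "B", FilterSubject: "B.>"}]
//	B: Subjects ["B.>"], Sources [{Name: "A", FilterSubject: "A.>"}]
//
// Cycle detection is what protects against a sourced message being published
// and acknowledged more than once as it moves between the two streams.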
2603 func TestJetStreamClusterActiveActiveSourcedStreams(t *testing.T) { 2604 c := createJetStreamClusterExplicit(t, "R3S", 3) 2605 defer c.shutdown() 2606 2607 nc, js := jsClientConnect(t, c.randomServer()) 2608 defer nc.Close() 2609 2610 _, err := js.AddStream(&nats.StreamConfig{ 2611 Name: "A", 2612 Subjects: []string{"A.>"}, 2613 }) 2614 require_NoError(t, err) 2615 2616 _, err = js.AddStream(&nats.StreamConfig{ 2617 Name: "B", 2618 Subjects: []string{"B.>"}, 2619 }) 2620 require_NoError(t, err) 2621 2622 _, err = js.UpdateStream(&nats.StreamConfig{ 2623 Name: "A", 2624 Subjects: []string{"A.>"}, 2625 Sources: []*nats.StreamSource{{ 2626 Name: "B", 2627 FilterSubject: "B.>", 2628 }}, 2629 }) 2630 require_NoError(t, err) 2631 2632 // Before this would fail. 2633 _, err = js.UpdateStream(&nats.StreamConfig{ 2634 Name: "B", 2635 Subjects: []string{"B.>"}, 2636 Sources: []*nats.StreamSource{{ 2637 Name: "A", 2638 FilterSubject: "A.>", 2639 }}, 2640 }) 2641 require_NoError(t, err) 2642 } 2643 2644 func TestJetStreamClusterUpdateConsumerShouldNotForceDeleteOnRestart(t *testing.T) { 2645 c := createJetStreamClusterExplicit(t, "R7S", 7) 2646 defer c.shutdown() 2647 2648 nc, js := jsClientConnect(t, c.randomServer()) 2649 defer nc.Close() 2650 2651 _, err := js.AddStream(&nats.StreamConfig{ 2652 Name: "TEST", 2653 Subjects: []string{"foo", "bar"}, 2654 Replicas: 3, 2655 }) 2656 require_NoError(t, err) 2657 2658 ci, err := js.AddConsumer("TEST", &nats.ConsumerConfig{ 2659 Durable: "D", 2660 DeliverSubject: "_no_bind_", 2661 }) 2662 require_NoError(t, err) 2663 2664 // Shutdown a consumer follower. 2665 nc.Close() 2666 s := c.serverByName(ci.Cluster.Replicas[0].Name) 2667 s.Shutdown() 2668 2669 c.waitOnLeader() 2670 2671 nc, js = jsClientConnect(t, c.randomServer()) 2672 defer nc.Close() 2673 2674 // Change delivery subject. 2675 _, err = js.UpdateConsumer("TEST", &nats.ConsumerConfig{ 2676 Durable: "D", 2677 DeliverSubject: "_d_", 2678 }) 2679 require_NoError(t, err) 2680 2681 // Create interest in new and old deliver subject. 2682 _, err = nc.SubscribeSync("_d_") 2683 require_NoError(t, err) 2684 _, err = nc.SubscribeSync("_no_bind_") 2685 require_NoError(t, err) 2686 nc.Flush() 2687 2688 c.restartServer(s) 2689 c.waitOnAllCurrent() 2690 2691 // Wait on bad error that would cleanup consumer. 
2692 time.Sleep(time.Second) 2693 2694 _, err = js.ConsumerInfo("TEST", "D") 2695 require_NoError(t, err) 2696 } 2697 2698 func TestJetStreamClusterInterestPolicyEphemeral(t *testing.T) { 2699 c := createJetStreamClusterExplicit(t, "R3S", 3) 2700 defer c.shutdown() 2701 2702 for _, test := range []struct { 2703 testName string 2704 stream string 2705 subject string 2706 durable string 2707 name string 2708 }{ 2709 {testName: "InterestWithDurable", durable: "eph", subject: "intdur", stream: "INT_DUR"}, 2710 {testName: "InterestWithName", name: "eph", subject: "inteph", stream: "INT_EPH"}, 2711 } { 2712 t.Run(test.testName, func(t *testing.T) { 2713 var err error 2714 2715 nc, js := jsClientConnect(t, c.randomServer()) 2716 defer nc.Close() 2717 2718 _, err = js.AddStream(&nats.StreamConfig{ 2719 Name: test.stream, 2720 Subjects: []string{test.subject}, 2721 Retention: nats.LimitsPolicy, 2722 Replicas: 3, 2723 }) 2724 require_NoError(t, err) 2725 2726 const inactiveThreshold = time.Second 2727 2728 _, err = js.AddConsumer(test.stream, &nats.ConsumerConfig{ 2729 DeliverSubject: nats.NewInbox(), 2730 AckPolicy: nats.AckExplicitPolicy, 2731 InactiveThreshold: inactiveThreshold, 2732 Durable: test.durable, 2733 Name: test.name, 2734 }) 2735 require_NoError(t, err) 2736 2737 name := test.durable 2738 if test.durable == _EMPTY_ { 2739 name = test.name 2740 } 2741 2742 const msgs = 5_000 2743 done, count := make(chan bool, 1), 0 2744 2745 sub, err := js.Subscribe(_EMPTY_, func(msg *nats.Msg) { 2746 require_NoError(t, msg.Ack()) 2747 count++ 2748 if count >= msgs { 2749 select { 2750 case done <- true: 2751 default: 2752 } 2753 } 2754 }, nats.Bind(test.stream, name), nats.ManualAck()) 2755 require_NoError(t, err) 2756 2757 // This happens only if we start publishing messages after consumer was created. 2758 pubDone := make(chan struct{}) 2759 go func(subject string) { 2760 for i := 0; i < msgs; i++ { 2761 js.Publish(subject, []byte("DATA")) 2762 } 2763 close(pubDone) 2764 }(test.subject) 2765 2766 // Wait for inactive threshold to expire and all messages to be published and received 2767 // Bug is we clean up active consumers when we should not. 2768 time.Sleep(3 * inactiveThreshold / 2) 2769 2770 select { 2771 case <-pubDone: 2772 case <-time.After(10 * time.Second): 2773 t.Fatalf("Did not receive completion signal") 2774 } 2775 2776 info, err := js.ConsumerInfo(test.stream, name) 2777 if err != nil { 2778 t.Fatalf("Expected to be able to retrieve consumer: %v", err) 2779 } 2780 require_True(t, info.Delivered.Stream == msgs) 2781 2782 // Stop the subscription and remove the interest. 2783 err = sub.Unsubscribe() 2784 require_NoError(t, err) 2785 2786 // Now wait for interest inactivity threshold to kick in. 2787 time.Sleep(3 * inactiveThreshold / 2) 2788 2789 // Check if the consumer has been removed. 2790 _, err = js.ConsumerInfo(test.stream, name) 2791 require_Error(t, err, nats.ErrConsumerNotFound) 2792 }) 2793 } 2794 } 2795 2796 // TestJetStreamClusterWALBuildupOnNoOpPull tests whether or not the consumer 2797 // RAFT log is being compacted when the stream is idle but we are performing 2798 // lots of fetches. Otherwise the disk usage just spirals out of control if 2799 // there are no other state changes to trigger a compaction. 
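//
// The bound checked at the end of the test is roughly (sketch):
//
//	entries, _ := consumer.raftNode().Size()
//	// entries should stay small (here, under 1024) even after ~11k no-op fetches
//
// Without compaction for fetch-only traffic, those entries (and the on-disk WAL)
// keep accumulating even though the stream contents never change.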
2800 func TestJetStreamClusterWALBuildupOnNoOpPull(t *testing.T) { 2801 c := createJetStreamClusterExplicit(t, "R3S", 3) 2802 defer c.shutdown() 2803 2804 nc, js := jsClientConnect(t, c.randomServer()) 2805 defer nc.Close() 2806 2807 _, err := js.AddStream(&nats.StreamConfig{ 2808 Name: "TEST", 2809 Subjects: []string{"foo"}, 2810 Replicas: 3, 2811 }) 2812 require_NoError(t, err) 2813 2814 sub, err := js.PullSubscribe( 2815 "foo", 2816 "durable", 2817 nats.ConsumerReplicas(3), 2818 ) 2819 require_NoError(t, err) 2820 2821 for i := 0; i < 10000; i++ { 2822 _, _ = sub.Fetch(1, nats.MaxWait(time.Microsecond)) 2823 } 2824 2825 // Needs to be at least 10 seconds, otherwise we won't hit the 2826 // minSnapDelta that prevents us from snapshotting too often 2827 time.Sleep(time.Second * 11) 2828 2829 for i := 0; i < 1024; i++ { 2830 _, _ = sub.Fetch(1, nats.MaxWait(time.Microsecond)) 2831 } 2832 2833 time.Sleep(time.Second) 2834 2835 server := c.randomNonConsumerLeader(globalAccountName, "TEST", "durable") 2836 2837 stream, err := server.globalAccount().lookupStream("TEST") 2838 require_NoError(t, err) 2839 2840 consumer := stream.lookupConsumer("durable") 2841 require_NotNil(t, consumer) 2842 2843 entries, bytes := consumer.raftNode().Size() 2844 t.Log("new entries:", entries) 2845 t.Log("new bytes:", bytes) 2846 2847 if max := uint64(1024); entries > max { 2848 t.Fatalf("got %d entries, expected less than %d entries", entries, max) 2849 } 2850 } 2851 2852 // Found in https://github.com/nats-io/nats-server/issues/3848 2853 // When Max Age was specified and stream was scaled up, new replicas 2854 // were expiring messages much later than the leader. 2855 func TestJetStreamClusterStreamMaxAgeScaleUp(t *testing.T) { 2856 c := createJetStreamClusterExplicit(t, "R3S", 3) 2857 defer c.shutdown() 2858 2859 nc, js := jsClientConnect(t, c.randomServer()) 2860 defer nc.Close() 2861 2862 for _, test := range []struct { 2863 name string 2864 storage nats.StorageType 2865 stream string 2866 purge bool 2867 }{ 2868 {name: "file", storage: nats.FileStorage, stream: "A", purge: false}, 2869 {name: "memory", storage: nats.MemoryStorage, stream: "B", purge: false}, 2870 {name: "file with purge", storage: nats.FileStorage, stream: "C", purge: true}, 2871 {name: "memory with purge", storage: nats.MemoryStorage, stream: "D", purge: true}, 2872 } { 2873 2874 t.Run(test.name, func(t *testing.T) { 2875 ttl := time.Second * 5 2876 // Add stream with one replica and short MaxAge. 2877 _, err := js.AddStream(&nats.StreamConfig{ 2878 Name: test.stream, 2879 Replicas: 1, 2880 Subjects: []string{test.stream}, 2881 MaxAge: ttl, 2882 Storage: test.storage, 2883 }) 2884 require_NoError(t, err) 2885 2886 // Add some messages. 2887 for i := 0; i < 10; i++ { 2888 sendStreamMsg(t, nc, test.stream, "HELLO") 2889 } 2890 // We need to also test if we properly set expiry 2891 // if first sequence is not 1. 2892 if test.purge { 2893 err = js.PurgeStream(test.stream) 2894 require_NoError(t, err) 2895 // Add some messages. 2896 for i := 0; i < 10; i++ { 2897 sendStreamMsg(t, nc, test.stream, "HELLO") 2898 } 2899 } 2900 // Mark the time when all messages were published. 2901 start := time.Now() 2902 2903 // Sleep for half of the MaxAge time. 2904 time.Sleep(ttl / 2) 2905 2906 // Scale up the Stream to 3 replicas. 
2907 _, err = js.UpdateStream(&nats.StreamConfig{ 2908 Name: test.stream, 2909 Replicas: 3, 2910 Subjects: []string{test.stream}, 2911 MaxAge: ttl, 2912 Storage: test.storage, 2913 }) 2914 require_NoError(t, err) 2915 2916 // All messages should still be there. 2917 info, err := js.StreamInfo(test.stream) 2918 require_NoError(t, err) 2919 require_True(t, info.State.Msgs == 10) 2920 2921 // Wait until MaxAge is reached. 2922 time.Sleep(ttl - time.Since(start) + (1 * time.Second)) 2923 2924 // Check if all messages are expired. 2925 info, err = js.StreamInfo(test.stream) 2926 require_NoError(t, err) 2927 require_True(t, info.State.Msgs == 0) 2928 2929 // Now switch leader to one of replicas 2930 _, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, test.stream), nil, time.Second) 2931 require_NoError(t, err) 2932 c.waitOnStreamLeader("$G", test.stream) 2933 2934 // and make sure that it also expired all messages 2935 info, err = js.StreamInfo(test.stream) 2936 require_NoError(t, err) 2937 require_True(t, info.State.Msgs == 0) 2938 }) 2939 } 2940 } 2941 2942 func TestJetStreamClusterWorkQueueConsumerReplicatedAfterScaleUp(t *testing.T) { 2943 c := createJetStreamClusterExplicit(t, "R3S", 3) 2944 defer c.shutdown() 2945 2946 nc, js := jsClientConnect(t, c.randomServer()) 2947 defer nc.Close() 2948 2949 _, err := js.AddStream(&nats.StreamConfig{ 2950 Name: "TEST", 2951 Replicas: 1, 2952 Subjects: []string{"WQ"}, 2953 Retention: nats.WorkQueuePolicy, 2954 }) 2955 require_NoError(t, err) 2956 2957 // Create an ephemeral consumer. 2958 sub, err := js.SubscribeSync("WQ") 2959 require_NoError(t, err) 2960 2961 // Scale up to R3. 2962 _, err = js.UpdateStream(&nats.StreamConfig{ 2963 Name: "TEST", 2964 Replicas: 3, 2965 Subjects: []string{"WQ"}, 2966 Retention: nats.WorkQueuePolicy, 2967 }) 2968 require_NoError(t, err) 2969 c.waitOnStreamLeader(globalAccountName, "TEST") 2970 2971 ci, err := sub.ConsumerInfo() 2972 require_NoError(t, err) 2973 2974 require_True(t, ci.Config.Replicas == 0 || ci.Config.Replicas == 3) 2975 2976 c.waitOnConsumerLeader(globalAccountName, "TEST", ci.Name) 2977 s := c.consumerLeader(globalAccountName, "TEST", ci.Name) 2978 require_NotNil(t, s) 2979 2980 mset, err := s.GlobalAccount().lookupStream("TEST") 2981 require_NoError(t, err) 2982 2983 o := mset.lookupConsumer(ci.Name) 2984 require_NotNil(t, o) 2985 require_NotNil(t, o.raftNode()) 2986 } 2987 2988 // https://github.com/nats-io/nats-server/issues/3953 2989 func TestJetStreamClusterWorkQueueAfterScaleUp(t *testing.T) { 2990 c := createJetStreamClusterExplicit(t, "R3S", 3) 2991 defer c.shutdown() 2992 2993 nc, js := jsClientConnect(t, c.randomServer()) 2994 defer nc.Close() 2995 2996 _, err := js.AddStream(&nats.StreamConfig{ 2997 Name: "TEST", 2998 Replicas: 1, 2999 Subjects: []string{"WQ"}, 3000 Retention: nats.WorkQueuePolicy, 3001 }) 3002 require_NoError(t, err) 3003 3004 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 3005 Durable: "d1", 3006 DeliverSubject: "d1", 3007 AckPolicy: nats.AckExplicitPolicy, 3008 }) 3009 require_NoError(t, err) 3010 3011 wch := make(chan bool, 1) 3012 _, err = nc.Subscribe("d1", func(msg *nats.Msg) { 3013 msg.AckSync() 3014 wch <- true 3015 }) 3016 require_NoError(t, err) 3017 3018 _, err = js.UpdateStream(&nats.StreamConfig{ 3019 Name: "TEST", 3020 Replicas: 3, 3021 Subjects: []string{"WQ"}, 3022 Retention: nats.WorkQueuePolicy, 3023 }) 3024 require_NoError(t, err) 3025 c.waitOnStreamLeader(globalAccountName, "TEST") 3026 3027 sendStreamMsg(t, nc, "WQ", "SOME WORK") 3028 
<-wch 3029 3030 checkFor(t, time.Second, 200*time.Millisecond, func() error { 3031 si, err := js.StreamInfo("TEST") 3032 require_NoError(t, err) 3033 if si.State.Msgs == 0 { 3034 return nil 3035 } 3036 return fmt.Errorf("Still have %d msgs left", si.State.Msgs) 3037 }) 3038 } 3039 3040 func TestJetStreamClusterInterestBasedStreamAndConsumerSnapshots(t *testing.T) { 3041 c := createJetStreamClusterExplicit(t, "R3S", 3) 3042 defer c.shutdown() 3043 3044 nc, js := jsClientConnect(t, c.randomServer()) 3045 defer nc.Close() 3046 3047 _, err := js.AddStream(&nats.StreamConfig{ 3048 Name: "TEST", 3049 Replicas: 3, 3050 Subjects: []string{"foo"}, 3051 Retention: nats.InterestPolicy, 3052 }) 3053 require_NoError(t, err) 3054 3055 sub, err := js.SubscribeSync("foo", nats.Durable("d22")) 3056 require_NoError(t, err) 3057 3058 num := 200 3059 for i := 0; i < num; i++ { 3060 js.PublishAsync("foo", []byte("ok")) 3061 } 3062 select { 3063 case <-js.PublishAsyncComplete(): 3064 case <-time.After(5 * time.Second): 3065 t.Fatalf("Did not receive completion signal") 3066 } 3067 3068 checkSubsPending(t, sub, num) 3069 3070 // Shutdown one server. 3071 s := c.randomServer() 3072 s.Shutdown() 3073 3074 c.waitOnStreamLeader(globalAccountName, "TEST") 3075 3076 nc, js = jsClientConnect(t, c.randomServer()) 3077 defer nc.Close() 3078 3079 // Now ack all messages while the other server is down. 3080 for i := 0; i < num; i++ { 3081 m, err := sub.NextMsg(time.Second) 3082 require_NoError(t, err) 3083 m.AckSync() 3084 } 3085 3086 // Wait for all message acks to be processed and all messages to be removed. 3087 checkFor(t, time.Second, 200*time.Millisecond, func() error { 3088 si, err := js.StreamInfo("TEST") 3089 require_NoError(t, err) 3090 if si.State.Msgs == 0 { 3091 return nil 3092 } 3093 return fmt.Errorf("Still have %d msgs left", si.State.Msgs) 3094 }) 3095 3096 // Force a snapshot on the consumer leader before restarting the downed server. 3097 cl := c.consumerLeader(globalAccountName, "TEST", "d22") 3098 require_NotNil(t, cl) 3099 3100 mset, err := cl.GlobalAccount().lookupStream("TEST") 3101 require_NoError(t, err) 3102 3103 o := mset.lookupConsumer("d22") 3104 require_NotNil(t, o) 3105 3106 snap, err := o.store.EncodedState() 3107 require_NoError(t, err) 3108 3109 n := o.raftNode() 3110 require_NotNil(t, n) 3111 require_NoError(t, n.InstallSnapshot(snap)) 3112 3113 // Now restart the downed server. 3114 s = c.restartServer(s) 3115 3116 // Make the restarted server the eventual leader. 
3117 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 3118 c.waitOnStreamLeader(globalAccountName, "TEST") 3119 if sl := c.streamLeader(globalAccountName, "TEST"); sl != s { 3120 sl.JetStreamStepdownStream(globalAccountName, "TEST") 3121 return fmt.Errorf("Server %s is not leader yet", s) 3122 } 3123 return nil 3124 }) 3125 3126 si, err := js.StreamInfo("TEST") 3127 require_NoError(t, err) 3128 require_True(t, si.State.Msgs == 0) 3129 } 3130 3131 func TestJetStreamClusterConsumerFollowerStoreStateAckFloorBug(t *testing.T) { 3132 c := createJetStreamClusterExplicit(t, "R3S", 3) 3133 defer c.shutdown() 3134 3135 nc, js := jsClientConnect(t, c.randomServer()) 3136 defer nc.Close() 3137 3138 _, err := js.AddStream(&nats.StreamConfig{ 3139 Name: "TEST", 3140 Replicas: 3, 3141 Subjects: []string{"foo"}, 3142 }) 3143 require_NoError(t, err) 3144 3145 sub, err := js.PullSubscribe(_EMPTY_, "C", nats.BindStream("TEST"), nats.ManualAck()) 3146 require_NoError(t, err) 3147 3148 num := 100 3149 for i := 0; i < num; i++ { 3150 sendStreamMsg(t, nc, "foo", "data") 3151 } 3152 3153 // This one prevents the state for pending from reaching 0 and resetting, which would not show the bug. 3154 sendStreamMsg(t, nc, "foo", "data") 3155 3156 // Ack all but one and out of order and make sure all consumers have the same stored state. 3157 msgs, err := sub.Fetch(num, nats.MaxWait(time.Second)) 3158 require_NoError(t, err) 3159 require_True(t, len(msgs) == num) 3160 3161 _, err = sub.Fetch(1, nats.MaxWait(time.Second)) 3162 require_NoError(t, err) 3163 3164 rand.Shuffle(len(msgs), func(i, j int) { msgs[i], msgs[j] = msgs[j], msgs[i] }) 3165 for _, m := range msgs { 3166 if err := m.AckSync(); err != nil { 3167 t.Fatalf("Ack failed :%+v", err) 3168 } 3169 } 3170 3171 checkConsumerState := func(delivered, ackFloor nats.SequenceInfo, numAckPending int) error { 3172 expectedDelivered := uint64(num) + 1 3173 if delivered.Stream != expectedDelivered || delivered.Consumer != expectedDelivered { 3174 return fmt.Errorf("Wrong delivered, expected %d got %+v", expectedDelivered, delivered) 3175 } 3176 expectedAck := uint64(num) 3177 if ackFloor.Stream != expectedAck || ackFloor.Consumer != expectedAck { 3178 return fmt.Errorf("Wrong ackFloor, expected %d got %+v", expectedAck, ackFloor) 3179 } 3180 if numAckPending != 1 { 3181 return errors.New("Expected num ack pending to be 1") 3182 } 3183 return nil 3184 } 3185 3186 ci, err := js.ConsumerInfo("TEST", "C") 3187 require_NoError(t, err) 3188 require_NoError(t, checkConsumerState(ci.Delivered, ci.AckFloor, ci.NumAckPending)) 3189 3190 // Check each consumer on each server for it's store state and make sure it matches as well. 
3191 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 3192 for _, s := range c.servers { 3193 mset, err := s.GlobalAccount().lookupStream("TEST") 3194 if err != nil { 3195 return err 3196 } 3197 if mset == nil { 3198 return errors.New("Mset should not be nil") 3199 } 3200 o := mset.lookupConsumer("C") 3201 if o == nil { 3202 return errors.New("Consumer should not be nil") 3203 } 3204 3205 state, err := o.store.State() 3206 if err != nil { 3207 return err 3208 } 3209 delivered := nats.SequenceInfo{Stream: state.Delivered.Stream, Consumer: state.Delivered.Consumer} 3210 ackFloor := nats.SequenceInfo{Stream: state.AckFloor.Stream, Consumer: state.AckFloor.Consumer} 3211 if err := checkConsumerState(delivered, ackFloor, len(state.Pending)); err != nil { 3212 return err 3213 } 3214 } 3215 return nil 3216 }) 3217 3218 // Now step down the consumer to move its leader and check the state after each transition. 3219 // Keep cycling until every server has been the consumer leader. 3220 seen := make(map[*Server]bool) 3221 cl := c.consumerLeader(globalAccountName, "TEST", "C") 3222 require_NotNil(t, cl) 3223 seen[cl] = true 3224 3225 allSeen := func() bool { 3226 for _, s := range c.servers { 3227 if !seen[s] { 3228 return false 3229 } 3230 } 3231 return true 3232 } 3233 3234 checkAllLeaders := func() { 3235 t.Helper() 3236 checkFor(t, 20*time.Second, 200*time.Millisecond, func() error { 3237 c.waitOnConsumerLeader(globalAccountName, "TEST", "C") 3238 if allSeen() { 3239 return nil 3240 } 3241 cl := c.consumerLeader(globalAccountName, "TEST", "C") 3242 seen[cl] = true 3243 ci, err := js.ConsumerInfo("TEST", "C") 3244 if err != nil { 3245 return err 3246 } 3247 if err := checkConsumerState(ci.Delivered, ci.AckFloor, ci.NumAckPending); err != nil { 3248 return err 3249 } 3250 cl.JetStreamStepdownConsumer(globalAccountName, "TEST", "C") 3251 return fmt.Errorf("Not all servers have been consumer leader yet") 3252 }) 3253 } 3254 3255 checkAllLeaders() 3256 3257 // Now restart all servers and check again.
3258 c.stopAll() 3259 c.restartAll() 3260 c.waitOnLeader() 3261 3262 nc, js = jsClientConnect(t, c.randomServer()) 3263 defer nc.Close() 3264 3265 seen = make(map[*Server]bool) 3266 checkAllLeaders() 3267 } 3268 3269 func TestJetStreamClusterInterestLeakOnDisableJetStream(t *testing.T) { 3270 c := createJetStreamClusterExplicit(t, "R3S", 3) 3271 defer c.shutdown() 3272 3273 nc, js := jsClientConnect(t, c.leader()) 3274 defer nc.Close() 3275 3276 for i := 1; i <= 5; i++ { 3277 _, err := js.AddStream(&nats.StreamConfig{ 3278 Name: fmt.Sprintf("test_%d", i), 3279 Subjects: []string{fmt.Sprintf("test_%d", i)}, 3280 Replicas: 3, 3281 }) 3282 require_NoError(t, err) 3283 } 3284 3285 c.waitOnAllCurrent() 3286 3287 server := c.randomNonLeader() 3288 account := server.SystemAccount() 3289 3290 server.DisableJetStream() 3291 3292 var sublist []*subscription 3293 account.sl.localSubs(&sublist, false) 3294 3295 var danglingJSC, danglingRaft int 3296 for _, sub := range sublist { 3297 if strings.HasPrefix(string(sub.subject), "$JSC.") { 3298 danglingJSC++ 3299 } else if strings.HasPrefix(string(sub.subject), "$NRG.") { 3300 danglingRaft++ 3301 } 3302 } 3303 if danglingJSC > 0 || danglingRaft > 0 { 3304 t.Fatalf("unexpected dangling interests for JetStream assets after shutdown (%d $JSC, %d $NRG)", danglingJSC, danglingRaft) 3305 } 3306 } 3307 3308 func TestJetStreamClusterNoLeadersDuringLameDuck(t *testing.T) { 3309 c := createJetStreamClusterExplicit(t, "R3S", 3) 3310 defer c.shutdown() 3311 3312 // Grab the first server and set lameduck option directly. 3313 s := c.servers[0] 3314 s.optsMu.Lock() 3315 s.opts.LameDuckDuration = 5 * time.Second 3316 s.opts.LameDuckGracePeriod = -5 * time.Second 3317 s.optsMu.Unlock() 3318 3319 // Connect to the third server. 3320 nc, js := jsClientConnect(t, c.servers[2]) 3321 defer nc.Close() 3322 3323 allServersHaveLeaders := func() bool { 3324 haveLeader := make(map[*Server]bool) 3325 for _, s := range c.servers { 3326 s.rnMu.RLock() 3327 for _, n := range s.raftNodes { 3328 if n.Leader() { 3329 haveLeader[s] = true 3330 break 3331 } 3332 } 3333 s.rnMu.RUnlock() 3334 } 3335 return len(haveLeader) == len(c.servers) 3336 } 3337 3338 // Create streams until we have a leader on all the servers. 3339 var index int 3340 checkFor(t, 10*time.Second, time.Millisecond, func() error { 3341 if allServersHaveLeaders() { 3342 return nil 3343 } 3344 index++ 3345 _, err := js.AddStream(&nats.StreamConfig{ 3346 Name: fmt.Sprintf("TEST_%d", index), 3347 Subjects: []string{fmt.Sprintf("foo.%d", index)}, 3348 Replicas: 3, 3349 }) 3350 require_NoError(t, err) 3351 return fmt.Errorf("All servers do not have at least one leader") 3352 }) 3353 3354 // Put our server into lameduck mode. 3355 // Need a client. 3356 dummy, _ := jsClientConnect(t, s) 3357 defer dummy.Close() 3358 go s.lameDuckMode() 3359 3360 // Wait for all leaders to move off. 3361 checkFor(t, 2*time.Second, 50*time.Millisecond, func() error { 3362 s.rnMu.RLock() 3363 defer s.rnMu.RUnlock() 3364 for _, n := range s.raftNodes { 3365 if n.Leader() { 3366 return fmt.Errorf("Server still has a leader") 3367 } 3368 } 3369 return nil 3370 }) 3371 3372 // All leader evacuated. 3373 3374 // Create a go routine that will create streams constantly. 
3375 qch := make(chan bool) 3376 go func() { 3377 var index int 3378 for { 3379 select { 3380 case <-time.After(time.Millisecond): 3381 index++ 3382 _, err := js.AddStream(&nats.StreamConfig{ 3383 Name: fmt.Sprintf("NEW_TEST_%d", index), 3384 Subjects: []string{fmt.Sprintf("bar.%d", index)}, 3385 Replicas: 3, 3386 }) 3387 if err != nil { 3388 return 3389 } 3390 case <-qch: 3391 return 3392 } 3393 } 3394 }() 3395 defer close(qch) 3396 3397 // Make sure we do not have any leaders placed on the lameduck server. 3398 for s.isRunning() { 3399 var hasLeader bool 3400 s.rnMu.RLock() 3401 for _, n := range s.raftNodes { 3402 hasLeader = hasLeader || n.Leader() 3403 } 3404 s.rnMu.RUnlock() 3405 if hasLeader { 3406 t.Fatalf("Server had a leader when it should not due to lameduck mode") 3407 } 3408 } 3409 } 3410 3411 func TestJetStreamClusterNoR1AssetsDuringLameDuck(t *testing.T) { 3412 c := createJetStreamClusterExplicit(t, "R3S", 3) 3413 defer c.shutdown() 3414 3415 // Grab the first server and set lameduck option directly. 3416 s := c.servers[0] 3417 s.optsMu.Lock() 3418 s.opts.LameDuckDuration = 5 * time.Second 3419 s.opts.LameDuckGracePeriod = -5 * time.Second 3420 s.optsMu.Unlock() 3421 3422 // Connect to the server to keep it alive when we go into LDM. 3423 dummy, _ := jsClientConnect(t, s) 3424 defer dummy.Close() 3425 3426 // Connect to the third server. 3427 nc, js := jsClientConnect(t, c.servers[2]) 3428 defer nc.Close() 3429 3430 // Now put the first server into lame duck mode. 3431 go s.lameDuckMode() 3432 3433 // Wait for news to arrive that the first server has gone into 3434 // lame duck mode and been marked offline. 3435 checkFor(t, 2*time.Second, 50*time.Millisecond, func() error { 3436 id := s.info.ID 3437 s := c.servers[2] 3438 s.mu.RLock() 3439 defer s.mu.RUnlock() 3440 3441 var isOffline bool 3442 s.nodeToInfo.Range(func(_, v any) bool { 3443 ni := v.(nodeInfo) 3444 if ni.id == id { 3445 isOffline = ni.offline 3446 return false 3447 } 3448 return true 3449 }) 3450 3451 if !isOffline { 3452 return fmt.Errorf("first node is still online unexpectedly") 3453 } 3454 return nil 3455 }) 3456 3457 // Create a go routine that will create streams constantly. 3458 qch := make(chan bool) 3459 go func() { 3460 var index int 3461 for { 3462 select { 3463 case <-time.After(time.Millisecond * 25): 3464 index++ 3465 _, err := js.AddStream(&nats.StreamConfig{ 3466 Name: fmt.Sprintf("NEW_TEST_%d", index), 3467 Subjects: []string{fmt.Sprintf("bar.%d", index)}, 3468 Replicas: 1, 3469 }) 3470 if err != nil { 3471 return 3472 } 3473 case <-qch: 3474 return 3475 } 3476 } 3477 }() 3478 defer close(qch) 3479 3480 gacc := s.GlobalAccount() 3481 if gacc == nil { 3482 t.Fatalf("No global account") 3483 } 3484 // Make sure we do not have any R1 assets placed on the lameduck server. 3485 for s.isRunning() { 3486 if len(gacc.streams()) > 0 { 3487 t.Fatalf("Server had an R1 asset when it should not due to lameduck mode") 3488 } 3489 time.Sleep(15 * time.Millisecond) 3490 } 3491 s.WaitForShutdown() 3492 } 3493 3494 // If a consumer has not been registered (possible in heavily loaded systems with lots of assets) 3495 // it could miss the signal of a message going away. If that message was pending and expires the 3496 // ack floor could fall below the stream first sequence. This test will force that condition and 3497 // make sure the system resolves itself. 
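//
// The drift is forced below by rewriting the stored consumer state on every server
// after the stream's messages have expired, roughly:
//
//	state.AckFloor.Consumer = 5
//	state.AckFloor.Stream = 5
//	state.Pending = nil
//
// With the stream empty and its first sequence already past 10, the servers are then
// expected to notice the stale floor and move it up to match the stream (10/10).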
3498 func TestJetStreamClusterConsumerAckFloorDrift(t *testing.T) { 3499 c := createJetStreamClusterExplicit(t, "R3S", 3) 3500 defer c.shutdown() 3501 3502 nc, js := jsClientConnect(t, c.randomServer()) 3503 defer nc.Close() 3504 3505 _, err := js.AddStream(&nats.StreamConfig{ 3506 Name: "TEST", 3507 Subjects: []string{"*"}, 3508 Replicas: 3, 3509 MaxAge: time.Second, 3510 MaxMsgs: 10, 3511 }) 3512 require_NoError(t, err) 3513 3514 sub, err := js.PullSubscribe("foo", "C") 3515 require_NoError(t, err) 3516 3517 for i := 0; i < 10; i++ { 3518 sendStreamMsg(t, nc, "foo", "HELLO") 3519 } 3520 3521 // No-op but will surface as delivered. 3522 _, err = sub.Fetch(10) 3523 require_NoError(t, err) 3524 3525 // We will grab the state with delivered being 10 and ackfloor being 0 directly. 3526 cl := c.consumerLeader(globalAccountName, "TEST", "C") 3527 require_NotNil(t, cl) 3528 3529 mset, err := cl.GlobalAccount().lookupStream("TEST") 3530 require_NoError(t, err) 3531 o := mset.lookupConsumer("C") 3532 require_NotNil(t, o) 3533 o.mu.RLock() 3534 state, err := o.store.State() 3535 o.mu.RUnlock() 3536 require_NoError(t, err) 3537 require_NotNil(t, state) 3538 3539 // Now let messages expire. 3540 checkFor(t, 5*time.Second, time.Second, func() error { 3541 si, err := js.StreamInfo("TEST") 3542 require_NoError(t, err) 3543 if si.State.Msgs == 0 { 3544 return nil 3545 } 3546 return fmt.Errorf("stream still has msgs") 3547 }) 3548 3549 // Set state to ackfloor of 5 and no pending. 3550 state.AckFloor.Consumer = 5 3551 state.AckFloor.Stream = 5 3552 state.Pending = nil 3553 3554 // Now put back the state underneath of the consumers. 3555 for _, s := range c.servers { 3556 mset, err := s.GlobalAccount().lookupStream("TEST") 3557 require_NoError(t, err) 3558 o := mset.lookupConsumer("C") 3559 require_NotNil(t, o) 3560 o.mu.Lock() 3561 err = o.setStoreState(state) 3562 cfs := o.store.(*consumerFileStore) 3563 o.mu.Unlock() 3564 require_NoError(t, err) 3565 // The lower layer will ignore, so set more directly. 3566 cfs.mu.Lock() 3567 cfs.state = *state 3568 cfs.mu.Unlock() 3569 // Also snapshot to remove any raft entries that could affect it. 3570 snap, err := o.store.EncodedState() 3571 require_NoError(t, err) 3572 require_NoError(t, o.raftNode().InstallSnapshot(snap)) 3573 } 3574 3575 cl.JetStreamStepdownConsumer(globalAccountName, "TEST", "C") 3576 c.waitOnConsumerLeader(globalAccountName, "TEST", "C") 3577 3578 checkFor(t, 5*time.Second, 100*time.Millisecond, func() error { 3579 ci, err := js.ConsumerInfo("TEST", "C") 3580 require_NoError(t, err) 3581 // Make sure we catch this and adjust. 3582 if ci.AckFloor.Stream == 10 && ci.AckFloor.Consumer == 10 { 3583 return nil 3584 } 3585 return fmt.Errorf("AckFloor not correct, expected 10, got %+v", ci.AckFloor) 3586 }) 3587 } 3588 3589 func TestJetStreamClusterInterestStreamFilteredConsumersWithNoInterest(t *testing.T) { 3590 c := createJetStreamClusterExplicit(t, "R5S", 5) 3591 defer c.shutdown() 3592 3593 nc, js := jsClientConnect(t, c.randomServer()) 3594 defer nc.Close() 3595 3596 _, err := js.AddStream(&nats.StreamConfig{ 3597 Name: "TEST", 3598 Subjects: []string{"*"}, 3599 Retention: nats.InterestPolicy, 3600 Replicas: 3, 3601 }) 3602 require_NoError(t, err) 3603 3604 // Create three subscribers. 
3605 ackCb := func(m *nats.Msg) { m.Ack() } 3606 3607 _, err = js.Subscribe("foo", ackCb, nats.BindStream("TEST"), nats.ManualAck()) 3608 require_NoError(t, err) 3609 3610 _, err = js.Subscribe("bar", ackCb, nats.BindStream("TEST"), nats.ManualAck()) 3611 require_NoError(t, err) 3612 3613 _, err = js.Subscribe("baz", ackCb, nats.BindStream("TEST"), nats.ManualAck()) 3614 require_NoError(t, err) 3615 3616 // Now send 100 messages, randomly picking foo or bar, but never baz. 3617 for i := 0; i < 100; i++ { 3618 if rand.Intn(2) > 0 { 3619 sendStreamMsg(t, nc, "foo", "HELLO") 3620 } else { 3621 sendStreamMsg(t, nc, "bar", "WORLD") 3622 } 3623 } 3624 3625 // Messages are expected to go to 0. 3626 checkFor(t, time.Second, 100*time.Millisecond, func() error { 3627 si, err := js.StreamInfo("TEST") 3628 require_NoError(t, err) 3629 if si.State.Msgs == 0 { 3630 return nil 3631 } 3632 return fmt.Errorf("stream still has msgs") 3633 }) 3634 } 3635 3636 func TestJetStreamClusterChangeClusterAfterStreamCreate(t *testing.T) { 3637 c := createJetStreamClusterExplicit(t, "NATS", 3) 3638 defer c.shutdown() 3639 3640 nc, js := jsClientConnect(t, c.randomServer()) 3641 defer nc.Close() 3642 3643 _, err := js.AddStream(&nats.StreamConfig{ 3644 Name: "TEST", 3645 Subjects: []string{"*"}, 3646 Replicas: 3, 3647 }) 3648 require_NoError(t, err) 3649 3650 for i := 0; i < 1000; i++ { 3651 sendStreamMsg(t, nc, "foo", "HELLO") 3652 } 3653 3654 _, err = js.UpdateStream(&nats.StreamConfig{ 3655 Name: "TEST", 3656 Subjects: []string{"*"}, 3657 Replicas: 1, 3658 }) 3659 require_NoError(t, err) 3660 3661 c.stopAll() 3662 3663 c.name = "FOO" 3664 for _, o := range c.opts { 3665 buf, err := os.ReadFile(o.ConfigFile) 3666 require_NoError(t, err) 3667 nbuf := bytes.Replace(buf, []byte("name: NATS"), []byte("name: FOO"), 1) 3668 err = os.WriteFile(o.ConfigFile, nbuf, 0640) 3669 require_NoError(t, err) 3670 } 3671 3672 c.restartAll() 3673 c.waitOnLeader() 3674 c.waitOnStreamLeader(globalAccountName, "TEST") 3675 3676 nc, js = jsClientConnect(t, c.randomServer()) 3677 defer nc.Close() 3678 3679 _, err = js.UpdateStream(&nats.StreamConfig{ 3680 Name: "TEST", 3681 Subjects: []string{"*"}, 3682 Replicas: 3, 3683 }) 3684 // This should fail with no suitable peers, since the asset was created under the NATS cluster which has no peers. 3685 require_Error(t, err, errors.New("nats: no suitable peers for placement")) 3686 3687 // Make sure we can swap the cluster. 3688 _, err = js.UpdateStream(&nats.StreamConfig{ 3689 Name: "TEST", 3690 Subjects: []string{"*"}, 3691 Placement: &nats.Placement{Cluster: "FOO"}, 3692 }) 3693 require_NoError(t, err) 3694 } 3695 3696 // The consumer info() call does not take into account whether a consumer 3697 // is a leader or not, so results would be very different when asking servers 3698 // that housed consumer followers vs leaders. 
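//
// The test below queries the monitoring endpoint on every server, i.e.
//
//	jsz, _ := s.Jsz(&JSzOptions{Accounts: true, Consumer: true})
//
// and expects Delivered and AckFloor to report the same sequences whether the server
// queried holds the consumer leader or one of its followers.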
3699 func TestJetStreamClusterConsumerInfoForJszForFollowers(t *testing.T) { 3700 c := createJetStreamClusterExplicit(t, "NATS", 3) 3701 defer c.shutdown() 3702 3703 nc, js := jsClientConnect(t, c.randomServer()) 3704 defer nc.Close() 3705 3706 _, err := js.AddStream(&nats.StreamConfig{ 3707 Name: "TEST", 3708 Subjects: []string{"*"}, 3709 Replicas: 3, 3710 }) 3711 require_NoError(t, err) 3712 3713 for i := 0; i < 1000; i++ { 3714 sendStreamMsg(t, nc, "foo", "HELLO") 3715 } 3716 3717 sub, err := js.PullSubscribe("foo", "d") 3718 require_NoError(t, err) 3719 3720 fetch, ack := 122, 22 3721 msgs, err := sub.Fetch(fetch, nats.MaxWait(10*time.Second)) 3722 require_NoError(t, err) 3723 require_True(t, len(msgs) == fetch) 3724 for _, m := range msgs[:ack] { 3725 m.AckSync() 3726 } 3727 // Let acks propagate. 3728 time.Sleep(100 * time.Millisecond) 3729 3730 for _, s := range c.servers { 3731 jsz, err := s.Jsz(&JSzOptions{Accounts: true, Consumer: true}) 3732 require_NoError(t, err) 3733 require_True(t, len(jsz.AccountDetails) == 1) 3734 require_True(t, len(jsz.AccountDetails[0].Streams) == 1) 3735 require_True(t, len(jsz.AccountDetails[0].Streams[0].Consumer) == 1) 3736 consumer := jsz.AccountDetails[0].Streams[0].Consumer[0] 3737 if consumer.Delivered.Consumer != uint64(fetch) || consumer.Delivered.Stream != uint64(fetch) { 3738 t.Fatalf("Incorrect delivered for %v: %+v", s, consumer.Delivered) 3739 } 3740 if consumer.AckFloor.Consumer != uint64(ack) || consumer.AckFloor.Stream != uint64(ack) { 3741 t.Fatalf("Incorrect ackfloor for %v: %+v", s, consumer.AckFloor) 3742 } 3743 } 3744 } 3745 3746 // Under certain scenarios we have seen consumers become stopped and cause healthz to fail. 3747 // The specific scneario is heavy loads, and stream resets on upgrades that could orphan consumers. 3748 func TestJetStreamClusterHealthzCheckForStoppedAssets(t *testing.T) { 3749 c := createJetStreamClusterExplicit(t, "NATS", 3) 3750 defer c.shutdown() 3751 3752 nc, js := jsClientConnect(t, c.randomServer()) 3753 defer nc.Close() 3754 3755 _, err := js.AddStream(&nats.StreamConfig{ 3756 Name: "TEST", 3757 Subjects: []string{"*"}, 3758 Replicas: 3, 3759 }) 3760 require_NoError(t, err) 3761 3762 for i := 0; i < 1000; i++ { 3763 sendStreamMsg(t, nc, "foo", "HELLO") 3764 } 3765 3766 sub, err := js.PullSubscribe("foo", "d") 3767 require_NoError(t, err) 3768 3769 fetch, ack := 122, 22 3770 msgs, err := sub.Fetch(fetch, nats.MaxWait(10*time.Second)) 3771 require_NoError(t, err) 3772 require_True(t, len(msgs) == fetch) 3773 for _, m := range msgs[:ack] { 3774 m.AckSync() 3775 } 3776 // Let acks propagate. 3777 time.Sleep(100 * time.Millisecond) 3778 3779 // We will now stop a stream on a given server. 3780 s := c.randomServer() 3781 mset, err := s.GlobalAccount().lookupStream("TEST") 3782 require_NoError(t, err) 3783 // Stop the stream 3784 mset.stop(false, false) 3785 3786 // Wait for exit. 3787 time.Sleep(100 * time.Millisecond) 3788 3789 checkFor(t, 15*time.Second, 500*time.Millisecond, func() error { 3790 hs := s.healthz(nil) 3791 if hs.Error != _EMPTY_ { 3792 return errors.New(hs.Error) 3793 } 3794 return nil 3795 }) 3796 3797 // Now take out the consumer. 3798 mset, err = s.GlobalAccount().lookupStream("TEST") 3799 require_NoError(t, err) 3800 3801 o := mset.lookupConsumer("d") 3802 require_NotNil(t, o) 3803 3804 o.stop() 3805 // Wait for exit. 
3806 time.Sleep(100 * time.Millisecond) 3807 3808 checkFor(t, 5*time.Second, 500*time.Millisecond, func() error { 3809 hs := s.healthz(nil) 3810 if hs.Error != _EMPTY_ { 3811 return errors.New(hs.Error) 3812 } 3813 return nil 3814 }) 3815 3816 // Now just stop the raft node from underneath the consumer. 3817 o = mset.lookupConsumer("d") 3818 require_NotNil(t, o) 3819 node := o.raftNode() 3820 require_NotNil(t, node) 3821 node.Stop() 3822 3823 checkFor(t, 5*time.Second, 500*time.Millisecond, func() error { 3824 hs := s.healthz(nil) 3825 if hs.Error != _EMPTY_ { 3826 return errors.New(hs.Error) 3827 } 3828 return nil 3829 }) 3830 } 3831 3832 // Make sure that stopping a stream shuts down its raft node. 3833 func TestJetStreamClusterStreamNodeShutdownBugOnStop(t *testing.T) { 3834 c := createJetStreamClusterExplicit(t, "NATS", 3) 3835 defer c.shutdown() 3836 3837 nc, js := jsClientConnect(t, c.randomServer()) 3838 defer nc.Close() 3839 3840 _, err := js.AddStream(&nats.StreamConfig{ 3841 Name: "TEST", 3842 Subjects: []string{"*"}, 3843 Replicas: 3, 3844 }) 3845 require_NoError(t, err) 3846 3847 for i := 0; i < 100; i++ { 3848 sendStreamMsg(t, nc, "foo", "HELLO") 3849 } 3850 3851 s := c.randomServer() 3852 numNodesStart := s.numRaftNodes() 3853 mset, err := s.GlobalAccount().lookupStream("TEST") 3854 require_NoError(t, err) 3855 node := mset.raftNode() 3856 require_NotNil(t, node) 3857 node.InstallSnapshot(mset.stateSnapshot()) 3858 // Stop the stream 3859 mset.stop(false, false) 3860 3861 if numNodes := s.numRaftNodes(); numNodes != numNodesStart-1 { 3862 t.Fatalf("RAFT nodes after stream stop incorrect: %d vs %d", numNodesStart, numNodes) 3863 } 3864 } 3865 3866 func TestJetStreamClusterStreamAccountingOnStoreError(t *testing.T) { 3867 c := createJetStreamClusterWithTemplate(t, jsClusterMaxBytesAccountLimitTempl, "NATS", 3) 3868 defer c.shutdown() 3869 3870 nc, js := jsClientConnect(t, c.randomServer()) 3871 defer nc.Close() 3872 3873 _, err := js.AddStream(&nats.StreamConfig{ 3874 Name: "TEST", 3875 Subjects: []string{"*"}, 3876 MaxBytes: 1 * 1024 * 1024 * 1024, 3877 Replicas: 3, 3878 }) 3879 require_NoError(t, err) 3880 3881 msg := strings.Repeat("Z", 32*1024) 3882 for i := 0; i < 10; i++ { 3883 sendStreamMsg(t, nc, "foo", msg) 3884 } 3885 s := c.randomServer() 3886 acc, err := s.LookupAccount("$U") 3887 require_NoError(t, err) 3888 mset, err := acc.lookupStream("TEST") 3889 require_NoError(t, err) 3890 mset.mu.Lock() 3891 mset.store.Stop() 3892 sjs := mset.js 3893 mset.mu.Unlock() 3894 3895 // Now delete the stream 3896 js.DeleteStream("TEST") 3897 3898 // Wait for this to propagate. 3899 // The bug would have us not release reserved resources properly. 3900 checkFor(t, 10*time.Second, 200*time.Millisecond, func() error { 3901 info, err := js.AccountInfo() 3902 require_NoError(t, err) 3903 // Default tier 3904 if info.Store != 0 { 3905 return fmt.Errorf("Expected store to be 0 but got %v", friendlyBytes(info.Store)) 3906 } 3907 return nil 3908 }) 3909 3910 // Now check js from server directly regarding reserved.
3911 sjs.mu.RLock() 3912 reserved := sjs.storeReserved 3913 sjs.mu.RUnlock() 3914 // Under bug will show 1GB 3915 if reserved != 0 { 3916 t.Fatalf("Expected store reserved to be 0 after stream delete, got %v", friendlyBytes(reserved)) 3917 } 3918 } 3919 3920 func TestJetStreamClusterStreamAccountingDriftFixups(t *testing.T) { 3921 c := createJetStreamClusterWithTemplate(t, jsClusterMaxBytesAccountLimitTempl, "NATS", 3) 3922 defer c.shutdown() 3923 3924 nc, js := jsClientConnect(t, c.randomServer()) 3925 defer nc.Close() 3926 3927 _, err := js.AddStream(&nats.StreamConfig{ 3928 Name: "TEST", 3929 Subjects: []string{"*"}, 3930 MaxBytes: 2 * 1024 * 1024, 3931 Replicas: 3, 3932 }) 3933 require_NoError(t, err) 3934 3935 msg := strings.Repeat("Z", 32*1024) 3936 for i := 0; i < 100; i++ { 3937 sendStreamMsg(t, nc, "foo", msg) 3938 } 3939 3940 err = js.PurgeStream("TEST") 3941 require_NoError(t, err) 3942 3943 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 3944 info, err := js.AccountInfo() 3945 require_NoError(t, err) 3946 if info.Store != 0 { 3947 return fmt.Errorf("Store usage not 0: %d", info.Store) 3948 } 3949 return nil 3950 }) 3951 3952 s := c.leader() 3953 jsz, err := s.Jsz(nil) 3954 require_NoError(t, err) 3955 require_True(t, jsz.JetStreamStats.Store == 0) 3956 3957 acc, err := s.LookupAccount("$U") 3958 require_NoError(t, err) 3959 mset, err := acc.lookupStream("TEST") 3960 require_NoError(t, err) 3961 mset.mu.RLock() 3962 jsa, tier, stype := mset.jsa, mset.tier, mset.stype 3963 mset.mu.RUnlock() 3964 // Drift the usage. 3965 jsa.updateUsage(tier, stype, -100) 3966 3967 checkFor(t, time.Second, 200*time.Millisecond, func() error { 3968 info, err := js.AccountInfo() 3969 require_NoError(t, err) 3970 if info.Store != 0 { 3971 return fmt.Errorf("Store usage not 0: %d", info.Store) 3972 } 3973 return nil 3974 }) 3975 jsz, err = s.Jsz(nil) 3976 require_NoError(t, err) 3977 require_True(t, jsz.JetStreamStats.Store == 0) 3978 } 3979 3980 // Some older streams seem to have been created or exist with no explicit cluster setting. 3981 // For server <= 2.9.16 you could not scale the streams up since we could not place them in another cluster. 3982 func TestJetStreamClusterStreamScaleUpNoGroupCluster(t *testing.T) { 3983 c := createJetStreamClusterExplicit(t, "NATS", 3) 3984 defer c.shutdown() 3985 3986 nc, js := jsClientConnect(t, c.randomServer()) 3987 defer nc.Close() 3988 3989 _, err := js.AddStream(&nats.StreamConfig{ 3990 Name: "TEST", 3991 Subjects: []string{"*"}, 3992 }) 3993 require_NoError(t, err) 3994 3995 // Manually going to grab stream assignment and update it to be without the group cluster. 3996 s := c.streamLeader(globalAccountName, "TEST") 3997 mset, err := s.GlobalAccount().lookupStream("TEST") 3998 require_NoError(t, err) 3999 4000 sa := mset.streamAssignment() 4001 require_NotNil(t, sa) 4002 // Make copy to not change stream's 4003 sa = sa.copyGroup() 4004 // Remove cluster and preferred. 4005 sa.Group.Cluster = _EMPTY_ 4006 sa.Group.Preferred = _EMPTY_ 4007 // Insert into meta layer. 4008 if sjs := s.getJetStream(); sjs != nil { 4009 sjs.mu.RLock() 4010 meta := sjs.cluster.meta 4011 sjs.mu.RUnlock() 4012 if meta != nil { 4013 meta.ForwardProposal(encodeUpdateStreamAssignment(sa)) 4014 } 4015 } 4016 // Make sure it got propagated.. 
4017 checkFor(t, 10*time.Second, 200*time.Millisecond, func() error { 4018 sa := mset.streamAssignment().copyGroup() 4019 require_NotNil(t, sa) 4020 if sa.Group.Cluster != _EMPTY_ { 4021 return fmt.Errorf("Cluster still not cleared") 4022 } 4023 return nil 4024 }) 4025 // Now we know it has been nil'd out. Make sure we can scale up. 4026 _, err = js.UpdateStream(&nats.StreamConfig{ 4027 Name: "TEST", 4028 Subjects: []string{"*"}, 4029 Replicas: 3, 4030 }) 4031 require_NoError(t, err) 4032 } 4033 4034 // https://github.com/nats-io/nats-server/issues/4162 4035 func TestJetStreamClusterStaleDirectGetOnRestart(t *testing.T) { 4036 c := createJetStreamClusterExplicit(t, "NATS", 3) 4037 defer c.shutdown() 4038 4039 nc, js := jsClientConnect(t, c.randomServer()) 4040 defer nc.Close() 4041 4042 kv, err := js.CreateKeyValue(&nats.KeyValueConfig{ 4043 Bucket: "TEST", 4044 Replicas: 3, 4045 }) 4046 require_NoError(t, err) 4047 4048 _, err = kv.PutString("foo", "bar") 4049 require_NoError(t, err) 4050 4051 // Close client in case we were connected to server below. 4052 // We will recreate. 4053 nc.Close() 4054 4055 // Shutdown a non-leader. 4056 s := c.randomNonStreamLeader(globalAccountName, "KV_TEST") 4057 s.Shutdown() 4058 4059 nc, js = jsClientConnect(t, c.randomServer()) 4060 defer nc.Close() 4061 4062 kv, err = js.KeyValue("TEST") 4063 require_NoError(t, err) 4064 4065 _, err = kv.PutString("foo", "baz") 4066 require_NoError(t, err) 4067 4068 errCh := make(chan error, 100) 4069 done := make(chan struct{}) 4070 4071 go func() { 4072 nc, js := jsClientConnect(t, c.randomServer()) 4073 defer nc.Close() 4074 4075 kv, err := js.KeyValue("TEST") 4076 if err != nil { 4077 errCh <- err 4078 return 4079 } 4080 4081 for { 4082 select { 4083 case <-done: 4084 return 4085 default: 4086 entry, err := kv.Get("foo") 4087 if err != nil { 4088 errCh <- err 4089 return 4090 } 4091 if v := string(entry.Value()); v != "baz" { 4092 errCh <- fmt.Errorf("Got wrong value: %q", v) 4093 } 4094 } 4095 } 4096 }() 4097 4098 // Restart 4099 c.restartServer(s) 4100 // Wait for a bit to make sure as this server participates in direct gets 4101 // it does not server stale reads. 4102 time.Sleep(2 * time.Second) 4103 close(done) 4104 4105 if len(errCh) > 0 { 4106 t.Fatalf("Expected no errors but got %v", <-errCh) 4107 } 4108 } 4109 4110 // This test mimics a user's setup where there is a cloud cluster/domain, and one for eu and ap that are leafnoded into the 4111 // cloud cluster, and one for cn that is leafnoded into the ap cluster. 4112 // We broke basic connectivity in 2.9.17 from publishing in eu for delivery in cn on same account which is daisy chained through ap. 4113 // We will also test cross account delivery in this test as well. 4114 func TestJetStreamClusterLeafnodePlusDaisyChainSetup(t *testing.T) { 4115 var cloudTmpl = ` 4116 listen: 127.0.0.1:-1 4117 server_name: %s 4118 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, domain: CLOUD, store_dir: '%s'} 4119 4120 leaf { listen: 127.0.0.1:-1 } 4121 4122 cluster { 4123 name: %s 4124 listen: 127.0.0.1:%d 4125 routes = [%s] 4126 } 4127 4128 accounts { 4129 F { 4130 jetstream: enabled 4131 users = [ { user: "F", pass: "pass" } ] 4132 exports [ { stream: "F.>" } ] 4133 } 4134 T { 4135 jetstream: enabled 4136 users = [ { user: "T", pass: "pass" } ] 4137 imports [ { stream: { account: F, subject: "F.>"} } ] 4138 } 4139 $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } 4140 }` 4141 4142 // Now create the cloud and make sure we are connected. 
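// Build the hub (CLOUD) cluster first. The EU/AP/CN leafnode clusters created below reuse the
// same account layout: account F exports "F.>" and account T imports it.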
4143 // Cloud 4144 c := createJetStreamCluster(t, cloudTmpl, "CLOUD", _EMPTY_, 3, 22020, false) 4145 defer c.shutdown() 4146 4147 var lnTmpl = ` 4148 listen: 127.0.0.1:-1 4149 server_name: %s 4150 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 4151 4152 {{leaf}} 4153 4154 cluster { 4155 name: %s 4156 listen: 127.0.0.1:%d 4157 routes = [%s] 4158 } 4159 4160 accounts { 4161 F { 4162 jetstream: enabled 4163 users = [ { user: "F", pass: "pass" } ] 4164 exports [ { stream: "F.>" } ] 4165 } 4166 T { 4167 jetstream: enabled 4168 users = [ { user: "T", pass: "pass" } ] 4169 imports [ { stream: { account: F, subject: "F.>"} } ] 4170 } 4171 $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } 4172 }` 4173 4174 var leafFrag = ` 4175 leaf { 4176 listen: 127.0.0.1:-1 4177 remotes [ { urls: [ %s ], account: "T" }, { urls: [ %s ], account: "F" } ] 4178 }` 4179 4180 genLeafTmpl := func(tmpl string, c *cluster) string { 4181 t.Helper() 4182 // Create our leafnode cluster template first. 4183 var lnt, lnf []string 4184 for _, s := range c.servers { 4185 if s.ClusterName() != c.name { 4186 continue 4187 } 4188 ln := s.getOpts().LeafNode 4189 lnt = append(lnt, fmt.Sprintf("nats://T:pass@%s:%d", ln.Host, ln.Port)) 4190 lnf = append(lnf, fmt.Sprintf("nats://F:pass@%s:%d", ln.Host, ln.Port)) 4191 } 4192 lntc := strings.Join(lnt, ", ") 4193 lnfc := strings.Join(lnf, ", ") 4194 return strings.Replace(tmpl, "{{leaf}}", fmt.Sprintf(leafFrag, lntc, lnfc), 1) 4195 } 4196 4197 // Cluster EU 4198 // Domain is "EU' 4199 tmpl := strings.Replace(lnTmpl, "store_dir:", fmt.Sprintf(`domain: "%s", store_dir:`, "EU"), 1) 4200 tmpl = genLeafTmpl(tmpl, c) 4201 lceu := createJetStreamCluster(t, tmpl, "EU", "EU-", 3, 22110, false) 4202 lceu.waitOnClusterReady() 4203 defer lceu.shutdown() 4204 4205 for _, s := range lceu.servers { 4206 checkLeafNodeConnectedCount(t, s, 2) 4207 } 4208 4209 // Cluster AP 4210 // Domain is "AP' 4211 tmpl = strings.Replace(lnTmpl, "store_dir:", fmt.Sprintf(`domain: "%s", store_dir:`, "AP"), 1) 4212 tmpl = genLeafTmpl(tmpl, c) 4213 lcap := createJetStreamCluster(t, tmpl, "AP", "AP-", 3, 22180, false) 4214 lcap.waitOnClusterReady() 4215 defer lcap.shutdown() 4216 4217 for _, s := range lcap.servers { 4218 checkLeafNodeConnectedCount(t, s, 2) 4219 } 4220 4221 // Cluster CN 4222 // Domain is "CN' 4223 // This one connects to AP, not the cloud hub. 4224 tmpl = strings.Replace(lnTmpl, "store_dir:", fmt.Sprintf(`domain: "%s", store_dir:`, "CN"), 1) 4225 tmpl = genLeafTmpl(tmpl, lcap) 4226 lccn := createJetStreamCluster(t, tmpl, "CN", "CN-", 3, 22280, false) 4227 lccn.waitOnClusterReady() 4228 defer lccn.shutdown() 4229 4230 for _, s := range lccn.servers { 4231 checkLeafNodeConnectedCount(t, s, 2) 4232 } 4233 4234 // Now connect to CN on account F and subscribe to data. 4235 nc, _ := jsClientConnect(t, lccn.randomServer(), nats.UserInfo("F", "pass")) 4236 defer nc.Close() 4237 fsub, err := nc.SubscribeSync("F.EU.>") 4238 require_NoError(t, err) 4239 4240 // Same for account T where the import is. 4241 nc, _ = jsClientConnect(t, lccn.randomServer(), nats.UserInfo("T", "pass")) 4242 defer nc.Close() 4243 tsub, err := nc.SubscribeSync("F.EU.>") 4244 require_NoError(t, err) 4245 4246 // Let sub propagate. 4247 time.Sleep(500 * time.Millisecond) 4248 4249 // Now connect to EU on account F and generate data. 
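// Publish in EU; messages are expected to reach the CN subscribers via the daisy chain
// EU -> CLOUD -> AP -> CN described above.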
4250 nc, _ = jsClientConnect(t, lceu.randomServer(), nats.UserInfo("F", "pass")) 4251 defer nc.Close() 4252 4253 num := 10 4254 for i := 0; i < num; i++ { 4255 err := nc.Publish("F.EU.DATA", []byte(fmt.Sprintf("MSG-%d", i))) 4256 require_NoError(t, err) 4257 } 4258 4259 checkSubsPending(t, fsub, num) 4260 // Since we export and import in each cluster, we will receive 4x. 4261 // First hop from EU -> CLOUD is 1F and 1T 4262 // Second hop from CLOUD -> AP is 1F, 1T and another 1T 4263 // Third hop from AP -> CN is 1F, 1T, 1T and 1T 4264 // Each cluster hop that has the export/import mapping will add another T message copy. 4265 checkSubsPending(t, tsub, num*4) 4266 4267 // Create stream in cloud. 4268 nc, js := jsClientConnect(t, c.randomServer(), nats.UserInfo("F", "pass")) 4269 defer nc.Close() 4270 4271 _, err = js.AddStream(&nats.StreamConfig{ 4272 Name: "TEST", 4273 Subjects: []string{"TEST.>"}, 4274 Replicas: 3, 4275 }) 4276 require_NoError(t, err) 4277 4278 for i := 0; i < 100; i++ { 4279 sendStreamMsg(t, nc, fmt.Sprintf("TEST.%d", i), "OK") 4280 } 4281 4282 // Now connect to EU. 4283 nc, js = jsClientConnect(t, lceu.randomServer(), nats.UserInfo("F", "pass")) 4284 defer nc.Close() 4285 4286 // Create a mirror. 4287 _, err = js.AddStream(&nats.StreamConfig{ 4288 Name: "M", 4289 Mirror: &nats.StreamSource{ 4290 Name: "TEST", 4291 Domain: "CLOUD", 4292 }, 4293 }) 4294 require_NoError(t, err) 4295 4296 checkFor(t, time.Second, 200*time.Millisecond, func() error { 4297 si, err := js.StreamInfo("M") 4298 require_NoError(t, err) 4299 if si.State.Msgs == 100 { 4300 return nil 4301 } 4302 return fmt.Errorf("State not current: %+v", si.State) 4303 }) 4304 } 4305 4306 // https://github.com/nats-io/nats-server/pull/4197 4307 func TestJetStreamClusterPurgeExReplayAfterRestart(t *testing.T) { 4308 c := createJetStreamClusterExplicit(t, "P3F", 3) 4309 defer c.shutdown() 4310 4311 // Client based API 4312 nc, js := jsClientConnect(t, c.randomServer()) 4313 defer nc.Close() 4314 4315 _, err := js.AddStream(&nats.StreamConfig{ 4316 Name: "TEST", 4317 Subjects: []string{"TEST.>"}, 4318 Replicas: 3, 4319 }) 4320 require_NoError(t, err) 4321 4322 sendStreamMsg(t, nc, "TEST.0", "OK") 4323 sendStreamMsg(t, nc, "TEST.1", "OK") 4324 sendStreamMsg(t, nc, "TEST.2", "OK") 4325 4326 runTest := func(f func(js nats.JetStreamManager)) *nats.StreamInfo { 4327 nc, js := jsClientConnect(t, c.randomServer()) 4328 defer nc.Close() 4329 4330 // install snapshot, then execute interior func, ensuring the purge will be recovered later 4331 fsl := c.streamLeader(globalAccountName, "TEST") 4332 fsl.JetStreamSnapshotStream(globalAccountName, "TEST") 4333 4334 f(js) 4335 time.Sleep(250 * time.Millisecond) 4336 4337 fsl.Shutdown() 4338 fsl.WaitForShutdown() 4339 fsl = c.restartServer(fsl) 4340 c.waitOnServerCurrent(fsl) 4341 4342 nc, js = jsClientConnect(t, c.randomServer()) 4343 defer nc.Close() 4344 4345 c.waitOnStreamLeader(globalAccountName, "TEST") 4346 sl := c.streamLeader(globalAccountName, "TEST") 4347 4348 // keep stepping down so the stream leader matches the initial leader 4349 // we need to check if it restored from the snapshot properly 4350 for sl != fsl { 4351 _, err := nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, time.Second) 4352 require_NoError(t, err) 4353 c.waitOnStreamLeader(globalAccountName, "TEST") 4354 sl = c.streamLeader(globalAccountName, "TEST") 4355 } 4356 4357 si, err := js.StreamInfo("TEST") 4358 require_NoError(t, err) 4359 return si 4360 } 4361 si := runTest(func(js 
nats.JetStreamManager) { 4362 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Subject: "TEST.0"}) 4363 require_NoError(t, err) 4364 }) 4365 if si.State.Msgs != 2 { 4366 t.Fatalf("Expected 2 msgs after restart, got %d", si.State.Msgs) 4367 } 4368 if si.State.FirstSeq != 2 || si.State.LastSeq != 3 { 4369 t.Fatalf("Expected FirstSeq=2, LastSeq=3 after restart, got FirstSeq=%d, LastSeq=%d", 4370 si.State.FirstSeq, si.State.LastSeq) 4371 } 4372 4373 si = runTest(func(js nats.JetStreamManager) { 4374 err = js.PurgeStream("TEST") 4375 require_NoError(t, err) 4376 // Send 2 more messages. 4377 sendStreamMsg(t, nc, "TEST.1", "OK") 4378 sendStreamMsg(t, nc, "TEST.2", "OK") 4379 }) 4380 if si.State.Msgs != 2 { 4381 t.Fatalf("Expected 2 msgs after restart, got %d", si.State.Msgs) 4382 } 4383 if si.State.FirstSeq != 4 || si.State.LastSeq != 5 { 4384 t.Fatalf("Expected FirstSeq=4, LastSeq=5 after restart, got FirstSeq=%d, LastSeq=%d", 4385 si.State.FirstSeq, si.State.LastSeq) 4386 } 4387 4388 // Now test a keep 4389 si = runTest(func(js nats.JetStreamManager) { 4390 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Keep: 1}) 4391 require_NoError(t, err) 4392 // Send 4 more messages. 4393 sendStreamMsg(t, nc, "TEST.1", "OK") 4394 sendStreamMsg(t, nc, "TEST.2", "OK") 4395 sendStreamMsg(t, nc, "TEST.3", "OK") 4396 sendStreamMsg(t, nc, "TEST.1", "OK") 4397 }) 4398 if si.State.Msgs != 5 { 4399 t.Fatalf("Expected 5 msgs after restart, got %d", si.State.Msgs) 4400 } 4401 if si.State.FirstSeq != 5 || si.State.LastSeq != 9 { 4402 t.Fatalf("Expected FirstSeq=5, LastSeq=9 after restart, got FirstSeq=%d, LastSeq=%d", 4403 si.State.FirstSeq, si.State.LastSeq) 4404 } 4405 4406 // Now test a keep on a subject 4407 si = runTest(func(js nats.JetStreamManager) { 4408 err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Subject: "TEST.1", Keep: 1}) 4409 require_NoError(t, err) 4410 // Send 3 more messages. 4411 sendStreamMsg(t, nc, "TEST.1", "OK") 4412 sendStreamMsg(t, nc, "TEST.2", "OK") 4413 sendStreamMsg(t, nc, "TEST.3", "OK") 4414 }) 4415 if si.State.Msgs != 7 { 4416 t.Fatalf("Expected 7 msgs after restart, got %d", si.State.Msgs) 4417 } 4418 if si.State.FirstSeq != 5 || si.State.LastSeq != 12 { 4419 t.Fatalf("Expected FirstSeq=5, LastSeq=12 after restart, got FirstSeq=%d, LastSeq=%d", 4420 si.State.FirstSeq, si.State.LastSeq) 4421 } 4422 } 4423 4424 func TestJetStreamClusterConsumerCleanupWithSameName(t *testing.T) { 4425 c := createJetStreamClusterExplicit(t, "R3F", 3) 4426 defer c.shutdown() 4427 4428 // Client based API 4429 nc, js := jsClientConnect(t, c.randomServer()) 4430 defer nc.Close() 4431 4432 _, err := js.AddStream(&nats.StreamConfig{ 4433 Name: "UPDATES", 4434 Subjects: []string{"DEVICE.*"}, 4435 Replicas: 3, 4436 }) 4437 require_NoError(t, err) 4438 4439 // Create a consumer that will be an R1 that we will auto-recreate but using the same name. 4440 // We want to make sure that the system does not continually try to cleanup the new one from the old one. 4441 4442 // Track the sequence for restart etc. 
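// Updated from the subscription callback and read by the test goroutine (both when
// resubscribing and when waiting on delivery), hence the atomic.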
4443 var seq atomic.Uint64 4444 4445 msgCB := func(msg *nats.Msg) { 4446 msg.AckSync() 4447 meta, err := msg.Metadata() 4448 require_NoError(t, err) 4449 seq.Store(meta.Sequence.Stream) 4450 } 4451 4452 waitOnSeqDelivered := func(expected uint64) { 4453 checkFor(t, 10*time.Second, 200*time.Millisecond, func() error { 4454 received := seq.Load() 4455 if received == expected { 4456 return nil 4457 } 4458 return fmt.Errorf("Seq is %d, want %d", received, expected) 4459 }) 4460 } 4461 4462 doSub := func() { 4463 _, err = js.Subscribe( 4464 "DEVICE.22", 4465 msgCB, 4466 nats.ConsumerName("dlc"), 4467 nats.SkipConsumerLookup(), 4468 nats.StartSequence(seq.Load()+1), 4469 nats.MaxAckPending(1), // One at a time. 4470 nats.ManualAck(), 4471 nats.ConsumerReplicas(1), 4472 nats.ConsumerMemoryStorage(), 4473 nats.MaxDeliver(1), 4474 nats.InactiveThreshold(time.Second), 4475 nats.IdleHeartbeat(250*time.Millisecond), 4476 ) 4477 require_NoError(t, err) 4478 } 4479 4480 // Track any errors for consumer not active so we can recreate the consumer. 4481 errCh := make(chan error, 10) 4482 nc.SetErrorHandler(func(c *nats.Conn, s *nats.Subscription, err error) { 4483 if errors.Is(err, nats.ErrConsumerNotActive) { 4484 s.Unsubscribe() 4485 errCh <- err 4486 doSub() 4487 } 4488 }) 4489 4490 doSub() 4491 4492 sendStreamMsg(t, nc, "DEVICE.22", "update-1") 4493 sendStreamMsg(t, nc, "DEVICE.22", "update-2") 4494 sendStreamMsg(t, nc, "DEVICE.22", "update-3") 4495 waitOnSeqDelivered(3) 4496 4497 // Shutdown the consumer's leader. 4498 s := c.consumerLeader(globalAccountName, "UPDATES", "dlc") 4499 s.Shutdown() 4500 c.waitOnStreamLeader(globalAccountName, "UPDATES") 4501 4502 // In case our client connection was to the same server. 4503 nc, _ = jsClientConnect(t, c.randomServer()) 4504 defer nc.Close() 4505 4506 sendStreamMsg(t, nc, "DEVICE.22", "update-4") 4507 sendStreamMsg(t, nc, "DEVICE.22", "update-5") 4508 sendStreamMsg(t, nc, "DEVICE.22", "update-6") 4509 4510 // Wait for the consumer not active error. 4511 <-errCh 4512 // Now restart server with the old consumer. 4513 c.restartServer(s) 4514 // Wait on all messages delivered. 4515 waitOnSeqDelivered(6) 4516 // Make sure no other errors showed up 4517 require_True(t, len(errCh) == 0) 4518 } 4519 func TestJetStreamClusterConsumerActions(t *testing.T) { 4520 c := createJetStreamClusterExplicit(t, "R3F", 3) 4521 defer c.shutdown() 4522 4523 nc, js := jsClientConnect(t, c.randomServer()) 4524 defer nc.Close() 4525 4526 var err error 4527 _, err = js.AddStream(&nats.StreamConfig{ 4528 Name: "TEST", 4529 Subjects: []string{"test"}, 4530 }) 4531 require_NoError(t, err) 4532 4533 ecSubj := fmt.Sprintf(JSApiConsumerCreateExT, "TEST", "CONSUMER", "test") 4534 crReq := CreateConsumerRequest{ 4535 Stream: "TEST", 4536 Config: ConsumerConfig{ 4537 DeliverPolicy: DeliverLast, 4538 FilterSubject: "test", 4539 AckPolicy: AckExplicit, 4540 }, 4541 } 4542 4543 // A new consumer. Should not be an error. 
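// Action semantics exercised below: ActionCreate succeeds when the consumer does not exist yet
// (or the config matches exactly) and errors if it exists with a different config, while
// ActionUpdate errors when the target consumer does not exist.
// For reference, the marshaled request sent to ecSubj looks roughly like the following
// (illustrative only, field names per the server's JSON tags):
//   {"stream_name":"TEST","config":{"deliver_policy":"last","filter_subject":"test","ack_policy":"explicit"},"action":"create"}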
4544 crReq.Action = ActionCreate 4545 req, err := json.Marshal(crReq) 4546 require_NoError(t, err) 4547 resp, err := nc.Request(ecSubj, req, 500*time.Millisecond) 4548 require_NoError(t, err) 4549 var ccResp JSApiConsumerCreateResponse 4550 err = json.Unmarshal(resp.Data, &ccResp) 4551 require_NoError(t, err) 4552 if ccResp.Error != nil { 4553 t.Fatalf("Unexpected error: %v", ccResp.Error) 4554 } 4555 ccResp.Error = nil 4556 4557 // Consumer exists, but config is the same, so should be ok 4558 resp, err = nc.Request(ecSubj, req, 500*time.Millisecond) 4559 require_NoError(t, err) 4560 err = json.Unmarshal(resp.Data, &ccResp) 4561 require_NoError(t, err) 4562 if ccResp.Error != nil { 4563 t.Fatalf("Unexpected error response: %v", ccResp.Error) 4564 } 4565 ccResp.Error = nil 4566 // Consumer exists. Config is different, so should error 4567 crReq.Config.Description = "changed" 4568 req, err = json.Marshal(crReq) 4569 require_NoError(t, err) 4570 resp, err = nc.Request(ecSubj, req, 500*time.Millisecond) 4571 require_NoError(t, err) 4572 err = json.Unmarshal(resp.Data, &ccResp) 4573 require_NoError(t, err) 4574 if ccResp.Error == nil { 4575 t.Fatalf("Unexpected ok response") 4576 } 4577 4578 ccResp.Error = nil 4579 // Consumer update, so update should be ok 4580 crReq.Action = ActionUpdate 4581 crReq.Config.Description = "changed again" 4582 req, err = json.Marshal(crReq) 4583 require_NoError(t, err) 4584 resp, err = nc.Request(ecSubj, req, 500*time.Millisecond) 4585 require_NoError(t, err) 4586 err = json.Unmarshal(resp.Data, &ccResp) 4587 require_NoError(t, err) 4588 if ccResp.Error != nil { 4589 t.Fatalf("Unexpected error response: %v", ccResp.Error) 4590 } 4591 4592 ecSubj = fmt.Sprintf(JSApiConsumerCreateExT, "TEST", "NEW", "test") 4593 ccResp.Error = nil 4594 // Updating new consumer, so should error 4595 crReq.Config.Name = "NEW" 4596 req, err = json.Marshal(crReq) 4597 require_NoError(t, err) 4598 resp, err = nc.Request(ecSubj, req, 500*time.Millisecond) 4599 require_NoError(t, err) 4600 err = json.Unmarshal(resp.Data, &ccResp) 4601 require_NoError(t, err) 4602 if ccResp.Error == nil { 4603 t.Fatalf("Unexpected ok response") 4604 } 4605 } 4606 4607 func TestJetStreamClusterSnapshotAndRestoreWithHealthz(t *testing.T) { 4608 c := createJetStreamClusterExplicit(t, "R3S", 3) 4609 defer c.shutdown() 4610 4611 nc, js := jsClientConnect(t, c.randomServer()) 4612 defer nc.Close() 4613 4614 _, err := js.AddStream(&nats.StreamConfig{ 4615 Name: "TEST", 4616 Subjects: []string{"foo"}, 4617 Replicas: 3, 4618 }) 4619 require_NoError(t, err) 4620 4621 toSend, msg := 1000, bytes.Repeat([]byte("Z"), 1024) 4622 for i := 0; i < toSend; i++ { 4623 _, err := js.PublishAsync("foo", msg) 4624 require_NoError(t, err) 4625 } 4626 select { 4627 case <-js.PublishAsyncComplete(): 4628 case <-time.After(5 * time.Second): 4629 t.Fatalf("Did not receive completion signal") 4630 } 4631 4632 sreq := &JSApiStreamSnapshotRequest{ 4633 DeliverSubject: nats.NewInbox(), 4634 ChunkSize: 512, 4635 } 4636 req, _ := json.Marshal(sreq) 4637 rmsg, err := nc.Request(fmt.Sprintf(JSApiStreamSnapshotT, "TEST"), req, time.Second) 4638 require_NoError(t, err) 4639 4640 var resp JSApiStreamSnapshotResponse 4641 json.Unmarshal(rmsg.Data, &resp) 4642 require_True(t, resp.Error == nil) 4643 4644 state := *resp.State 4645 cfg := *resp.Config 4646 4647 var snapshot []byte 4648 done := make(chan bool) 4649 4650 sub, _ := nc.Subscribe(sreq.DeliverSubject, func(m *nats.Msg) { 4651 // EOF 4652 if len(m.Data) == 0 { 4653 done <- true 4654 return
4655 } 4656 // Could be writing to a file here too. 4657 snapshot = append(snapshot, m.Data...) 4658 // Flow ack 4659 m.Respond(nil) 4660 }) 4661 defer sub.Unsubscribe() 4662 4663 // Wait to receive the snapshot. 4664 select { 4665 case <-done: 4666 case <-time.After(5 * time.Second): 4667 t.Fatalf("Did not receive our snapshot in time") 4668 } 4669 4670 // Delete before we try to restore. 4671 require_NoError(t, js.DeleteStream("TEST")) 4672 4673 checkHealth := func() { 4674 for _, s := range c.servers { 4675 s.healthz(nil) 4676 } 4677 } 4678 4679 var rresp JSApiStreamRestoreResponse 4680 rreq := &JSApiStreamRestoreRequest{ 4681 Config: cfg, 4682 State: state, 4683 } 4684 req, _ = json.Marshal(rreq) 4685 4686 rmsg, err = nc.Request(fmt.Sprintf(JSApiStreamRestoreT, "TEST"), req, 5*time.Second) 4687 require_NoError(t, err) 4688 4689 rresp.Error = nil 4690 json.Unmarshal(rmsg.Data, &rresp) 4691 require_True(t, resp.Error == nil) 4692 4693 checkHealth() 4694 4695 // We will now chunk the snapshot responses (and EOF). 4696 var chunk [1024]byte 4697 for i, r := 0, bytes.NewReader(snapshot); ; { 4698 n, err := r.Read(chunk[:]) 4699 if err != nil { 4700 break 4701 } 4702 nc.Request(rresp.DeliverSubject, chunk[:n], time.Second) 4703 i++ 4704 // We will call healthz for all servers half way through the restore. 4705 if i%100 == 0 { 4706 checkHealth() 4707 } 4708 } 4709 rmsg, err = nc.Request(rresp.DeliverSubject, nil, time.Second) 4710 require_NoError(t, err) 4711 rresp.Error = nil 4712 json.Unmarshal(rmsg.Data, &rresp) 4713 require_True(t, resp.Error == nil) 4714 4715 si, err := js.StreamInfo("TEST") 4716 require_NoError(t, err) 4717 require_True(t, si.State.Msgs == uint64(toSend)) 4718 4719 // Make sure stepdown works, this would fail before the fix. 4720 _, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, 5*time.Second) 4721 require_NoError(t, err) 4722 4723 si, err = js.StreamInfo("TEST") 4724 require_NoError(t, err) 4725 require_True(t, si.State.Msgs == uint64(toSend)) 4726 } 4727 4728 func TestJetStreamClusterBinaryStreamSnapshotCapability(t *testing.T) { 4729 c := createJetStreamClusterExplicit(t, "NATS", 3) 4730 defer c.shutdown() 4731 4732 nc, js := jsClientConnect(t, c.randomServer()) 4733 defer nc.Close() 4734 4735 _, err := js.AddStream(&nats.StreamConfig{ 4736 Name: "TEST", 4737 Subjects: []string{"foo"}, 4738 Replicas: 3, 4739 }) 4740 require_NoError(t, err) 4741 4742 mset, err := c.streamLeader(globalAccountName, "TEST").GlobalAccount().lookupStream("TEST") 4743 require_NoError(t, err) 4744 4745 if !mset.supportsBinarySnapshot() { 4746 t.Fatalf("Expected to signal that we could support binary stream snapshots") 4747 } 4748 } 4749 4750 func TestJetStreamClusterBadEncryptKey(t *testing.T) { 4751 c := createJetStreamClusterWithTemplate(t, jsClusterEncryptedTempl, "JSC", 3) 4752 defer c.shutdown() 4753 4754 nc, js := jsClientConnect(t, c.randomServer()) 4755 defer nc.Close() 4756 4757 // Create 10 streams. 4758 for i := 0; i < 10; i++ { 4759 _, err := js.AddStream(&nats.StreamConfig{ 4760 Name: fmt.Sprintf("TEST-%d", i), 4761 Replicas: 3, 4762 }) 4763 require_NoError(t, err) 4764 } 4765 4766 // Grab random server. 4767 s := c.randomServer() 4768 s.Shutdown() 4769 s.WaitForShutdown() 4770 4771 var opts *Options 4772 for i := 0; i < len(c.servers); i++ { 4773 if c.servers[i] == s { 4774 opts = c.opts[i] 4775 break 4776 } 4777 } 4778 require_NotNil(t, opts) 4779 4780 // Replace key with an empty key. 
4781 buf, err := os.ReadFile(opts.ConfigFile) 4782 require_NoError(t, err) 4783 nbuf := bytes.Replace(buf, []byte("key: \"s3cr3t!\""), []byte("key: \"\""), 1) 4784 err = os.WriteFile(opts.ConfigFile, nbuf, 0640) 4785 require_NoError(t, err) 4786 4787 // Make sure trying to start the server now fails. 4788 s, err = NewServer(LoadConfig(opts.ConfigFile)) 4789 require_NoError(t, err) 4790 require_NotNil(t, s) 4791 s.Start() 4792 if err := s.readyForConnections(1 * time.Second); err == nil { 4793 t.Fatalf("Expected server not to start") 4794 } 4795 } 4796 4797 func TestJetStreamClusterAccountUsageDrifts(t *testing.T) { 4798 tmpl := ` 4799 listen: 127.0.0.1:-1 4800 server_name: %s 4801 jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'} 4802 leaf { 4803 listen: 127.0.0.1:-1 4804 } 4805 cluster { 4806 name: %s 4807 listen: 127.0.0.1:%d 4808 routes = [%s] 4809 } 4810 ` 4811 opFrag := ` 4812 operator: %s 4813 system_account: %s 4814 resolver: { type: MEM } 4815 resolver_preload = { 4816 %s : %s 4817 %s : %s 4818 } 4819 ` 4820 4821 _, syspub := createKey(t) 4822 sysJwt := encodeClaim(t, jwt.NewAccountClaims(syspub), syspub) 4823 4824 accKp, aExpPub := createKey(t) 4825 accClaim := jwt.NewAccountClaims(aExpPub) 4826 accClaim.Limits.JetStreamTieredLimits["R1"] = jwt.JetStreamLimits{ 4827 DiskStorage: -1, Consumer: 1, Streams: 1} 4828 accClaim.Limits.JetStreamTieredLimits["R3"] = jwt.JetStreamLimits{ 4829 DiskStorage: -1, Consumer: 1, Streams: 1} 4830 accJwt := encodeClaim(t, accClaim, aExpPub) 4831 accCreds := newUser(t, accKp) 4832 4833 template := tmpl + fmt.Sprintf(opFrag, ojwt, syspub, syspub, sysJwt, aExpPub, accJwt) 4834 c := createJetStreamClusterWithTemplate(t, template, "R3S", 3) 4835 defer c.shutdown() 4836 4837 nc, js := jsClientConnect(t, c.randomServer(), nats.UserCredentials(accCreds)) 4838 defer nc.Close() 4839 4840 _, err := js.AddStream(&nats.StreamConfig{ 4841 Name: "TEST1", 4842 Subjects: []string{"foo"}, 4843 MaxBytes: 1 * 1024 * 1024 * 1024, 4844 MaxMsgs: 1000, 4845 Replicas: 3, 4846 }) 4847 require_NoError(t, err) 4848 4849 _, err = js.AddStream(&nats.StreamConfig{ 4850 Name: "TEST2", 4851 Subjects: []string{"bar"}, 4852 }) 4853 require_NoError(t, err) 4854 4855 // These expected store values can come directly from stream info's state bytes. 4856 // We will *= 3 for R3 4857 checkAccount := func(r1u, r3u uint64) { 4858 t.Helper() 4859 r3u *= 3 4860 4861 // Remote usage updates can be delayed, so wait for a bit for values we want. 4862 checkFor(t, 10*time.Second, 250*time.Millisecond, func() error { 4863 info, err := js.AccountInfo() 4864 require_NoError(t, err) 4865 require_True(t, len(info.Tiers) >= 2) 4866 // These can move. 4867 if u := info.Tiers["R1"].Store; u != r1u { 4868 return fmt.Errorf("Expected R1 to be %v, got %v", friendlyBytes(r1u), friendlyBytes(u)) 4869 } 4870 if u := info.Tiers["R3"].Store; u != r3u { 4871 return fmt.Errorf("Expected R3 to be %v, got %v", friendlyBytes(r3u), friendlyBytes(u)) 4872 } 4873 return nil 4874 }) 4875 } 4876 4877 checkAccount(0, 0) 4878 4879 // Now add in some R3 data. 4880 msg := bytes.Repeat([]byte("Z"), 32*1024) // 32k 4881 smallMsg := bytes.Repeat([]byte("Z"), 4*1024) // 4k 4882 4883 for i := 0; i < 1000; i++ { 4884 js.Publish("foo", msg) 4885 } 4886 sir3, err := js.StreamInfo("TEST1") 4887 require_NoError(t, err) 4888 4889 checkAccount(0, sir3.State.Bytes) 4890 4891 // Now add in some R1 data. 
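// TEST2 was created without an explicit Replicas setting, so it is an R1 stream and its bytes
// are charged against the R1 tier.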
4892 for i := 0; i < 100; i++ { 4893 js.Publish("bar", msg) 4894 } 4895 4896 sir1, err := js.StreamInfo("TEST2") 4897 require_NoError(t, err) 4898 4899 checkAccount(sir1.State.Bytes, sir3.State.Bytes) 4900 4901 // We will now test a bunch of scenarios to see that we are doing accounting correctly. 4902 4903 // Since our R3 has a limit of 1000 msgs, let's add in more msgs and drop older ones. 4904 for i := 0; i < 100; i++ { 4905 js.Publish("foo", smallMsg) 4906 } 4907 sir3, err = js.StreamInfo("TEST1") 4908 require_NoError(t, err) 4909 4910 checkAccount(sir1.State.Bytes, sir3.State.Bytes) 4911 4912 // Move our R3 stream leader and make sure accounting is correct. 4913 _, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST1"), nil, time.Second) 4914 require_NoError(t, err) 4915 4916 checkAccount(sir1.State.Bytes, sir3.State.Bytes) 4917 4918 // Now scale down. 4919 _, err = js.UpdateStream(&nats.StreamConfig{ 4920 Name: "TEST1", 4921 Subjects: []string{"foo"}, 4922 MaxBytes: 1 * 1024 * 1024 * 1024, 4923 MaxMsgs: 1000, 4924 Replicas: 1, 4925 }) 4926 require_NoError(t, err) 4927 4928 checkAccount(sir1.State.Bytes+sir3.State.Bytes, 0) 4929 4930 // Add in more msgs which will replace the older and bigger ones. 4931 for i := 0; i < 100; i++ { 4932 js.Publish("foo", smallMsg) 4933 } 4934 sir3, err = js.StreamInfo("TEST1") 4935 require_NoError(t, err) 4936 4937 // Now scale back up. 4938 _, err = js.UpdateStream(&nats.StreamConfig{ 4939 Name: "TEST1", 4940 Subjects: []string{"foo"}, 4941 MaxBytes: 1 * 1024 * 1024 * 1024, 4942 MaxMsgs: 1000, 4943 Replicas: 3, 4944 }) 4945 require_NoError(t, err) 4946 4947 checkAccount(sir1.State.Bytes, sir3.State.Bytes) 4948 4949 // Test Purge. 4950 err = js.PurgeStream("TEST1") 4951 require_NoError(t, err) 4952 4953 checkAccount(sir1.State.Bytes, 0) 4954 4955 for i := 0; i < 1000; i++ { 4956 js.Publish("foo", smallMsg) 4957 } 4958 sir3, err = js.StreamInfo("TEST1") 4959 require_NoError(t, err) 4960 4961 checkAccount(sir1.State.Bytes, sir3.State.Bytes) 4962 4963 requestLeaderStepDown := func() { 4964 ml := c.leader() 4965 checkFor(t, 5*time.Second, 250*time.Millisecond, func() error { 4966 if cml := c.leader(); cml == ml { 4967 nc.Request(JSApiLeaderStepDown, nil, time.Second) 4968 return fmt.Errorf("Metaleader has not moved yet") 4969 } 4970 return nil 4971 }) 4972 } 4973 4974 // Test meta leader stepdowns. 4975 for i := 0; i < len(c.servers); i++ { 4976 requestLeaderStepDown() 4977 checkAccount(sir1.State.Bytes, sir3.State.Bytes) 4978 } 4979 4980 // Now test cluster reset operations where we internally reset the NRG and optionally the stream too. 4981 // Only applicable to TEST1 stream which is R3. 4982 nl := c.randomNonStreamLeader(aExpPub, "TEST1") 4983 acc, err := nl.LookupAccount(aExpPub) 4984 require_NoError(t, err) 4985 mset, err := acc.lookupStream("TEST1") 4986 require_NoError(t, err) 4987 // NRG only 4988 mset.resetClusteredState(nil) 4989 checkAccount(sir1.State.Bytes, sir3.State.Bytes) 4990 // Need to re-lookup this stream since we will recreate from reset above. 4991 checkFor(t, 5*time.Second, 200*time.Millisecond, func() error { 4992 mset, err = acc.lookupStream("TEST1") 4993 return err 4994 }) 4995 // Now NRG and Stream state itself. 4996 mset.resetClusteredState(errFirstSequenceMismatch) 4997 checkAccount(sir1.State.Bytes, sir3.State.Bytes) 4998 4999 // Now test server restart 5000 for _, s := range c.servers { 5001 s.Shutdown() 5002 s.WaitForShutdown() 5003 s = c.restartServer(s) 5004 5005 // Wait on healthz and leader etc.
5006 checkFor(t, 10*time.Second, 200*time.Millisecond, func() error { 5007 if hs := s.healthz(nil); hs.Error != _EMPTY_ { 5008 return errors.New(hs.Error) 5009 } 5010 return nil 5011 }) 5012 c.waitOnLeader() 5013 c.waitOnStreamLeader(aExpPub, "TEST1") 5014 c.waitOnStreamLeader(aExpPub, "TEST2") 5015 5016 // Now check account again. 5017 checkAccount(sir1.State.Bytes, sir3.State.Bytes) 5018 } 5019 } 5020 5021 func TestJetStreamClusterStreamFailTracking(t *testing.T) { 5022 c := createJetStreamClusterExplicit(t, "R3S", 3) 5023 defer c.shutdown() 5024 5025 nc, js := jsClientConnect(t, c.randomServer()) 5026 defer nc.Close() 5027 5028 _, err := js.AddStream(&nats.StreamConfig{ 5029 Name: "TEST", 5030 Subjects: []string{"foo"}, 5031 Replicas: 3, 5032 }) 5033 require_NoError(t, err) 5034 5035 m := nats.NewMsg("foo") 5036 m.Data = []byte("OK") 5037 5038 b, bsz := 0, 5 5039 sendBatch := func() { 5040 for i := b * bsz; i < b*bsz+bsz; i++ { 5041 msgId := fmt.Sprintf("ID:%d", i) 5042 m.Header.Set(JSMsgId, msgId) 5043 // Send it twice on purpose. 5044 js.PublishMsg(m) 5045 js.PublishMsg(m) 5046 } 5047 b++ 5048 } 5049 5050 sendBatch() 5051 5052 _, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, time.Second) 5053 require_NoError(t, err) 5054 c.waitOnStreamLeader(globalAccountName, "TEST") 5055 5056 sendBatch() 5057 5058 // Now stop one and restart. 5059 nl := c.randomNonStreamLeader(globalAccountName, "TEST") 5060 mset, err := nl.GlobalAccount().lookupStream("TEST") 5061 require_NoError(t, err) 5062 // Reset raft 5063 mset.resetClusteredState(nil) 5064 time.Sleep(100 * time.Millisecond) 5065 5066 nl.Shutdown() 5067 nl.WaitForShutdown() 5068 5069 sendBatch() 5070 5071 nl = c.restartServer(nl) 5072 5073 sendBatch() 5074 5075 for { 5076 _, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, time.Second) 5077 require_NoError(t, err) 5078 c.waitOnStreamLeader(globalAccountName, "TEST") 5079 if nl == c.streamLeader(globalAccountName, "TEST") { 5080 break 5081 } 5082 } 5083 5084 sendBatch() 5085 5086 _, err = js.UpdateStream(&nats.StreamConfig{ 5087 Name: "TEST", 5088 Subjects: []string{"foo"}, 5089 Replicas: 1, 5090 }) 5091 require_NoError(t, err) 5092 5093 // Make sure all in order. 5094 errCh := make(chan error, 100) 5095 var wg sync.WaitGroup 5096 wg.Add(1) 5097 5098 expected, seen := b*bsz, 0 5099 5100 sub, err := js.Subscribe("foo", func(msg *nats.Msg) { 5101 expectedID := fmt.Sprintf("ID:%d", seen) 5102 if v := msg.Header.Get(JSMsgId); v != expectedID { 5103 errCh <- err 5104 wg.Done() 5105 msg.Sub.Unsubscribe() 5106 return 5107 } 5108 seen++ 5109 if seen >= expected { 5110 wg.Done() 5111 msg.Sub.Unsubscribe() 5112 } 5113 }) 5114 require_NoError(t, err) 5115 defer sub.Unsubscribe() 5116 5117 wg.Wait() 5118 if len(errCh) > 0 { 5119 t.Fatalf("Expected no errors, got %d", len(errCh)) 5120 } 5121 } 5122 5123 func TestJetStreamClusterStreamFailTrackingSnapshots(t *testing.T) { 5124 c := createJetStreamClusterExplicit(t, "R3S", 3) 5125 defer c.shutdown() 5126 5127 nc, js := jsClientConnect(t, c.randomServer()) 5128 defer nc.Close() 5129 5130 _, err := js.AddStream(&nats.StreamConfig{ 5131 Name: "TEST", 5132 Subjects: []string{"foo"}, 5133 Replicas: 3, 5134 }) 5135 require_NoError(t, err) 5136 5137 m := nats.NewMsg("foo") 5138 m.Data = []byte("OK") 5139 5140 // Send 1000 a dupe every msgID. 5141 for i := 0; i < 1000; i++ { 5142 msgId := fmt.Sprintf("ID:%d", i) 5143 m.Header.Set(JSMsgId, msgId) 5144 // Send it twice on purpose. 
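// The duplicate publish carries the same Nats-Msg-Id, so the stream's de-duplication drops it
// and each ID is stored once; the ordered consume at the end of the test relies on that.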
5145 js.PublishMsg(m) 5146 js.PublishMsg(m) 5147 } 5148 5149 // Now stop one. 5150 nl := c.randomNonStreamLeader(globalAccountName, "TEST") 5151 nl.Shutdown() 5152 nl.WaitForShutdown() 5153 5154 // Now send more and make sure leader snapshots. 5155 for i := 1000; i < 2000; i++ { 5156 msgId := fmt.Sprintf("ID:%d", i) 5157 m.Header.Set(JSMsgId, msgId) 5158 // Send it twice on purpose. 5159 js.PublishMsg(m) 5160 js.PublishMsg(m) 5161 } 5162 5163 sl := c.streamLeader(globalAccountName, "TEST") 5164 mset, err := sl.GlobalAccount().lookupStream("TEST") 5165 require_NoError(t, err) 5166 node := mset.raftNode() 5167 require_NotNil(t, node) 5168 node.InstallSnapshot(mset.stateSnapshot()) 5169 5170 // Now restart nl 5171 nl = c.restartServer(nl) 5172 c.waitOnServerCurrent(nl) 5173 5174 // Move leader to NL 5175 for { 5176 _, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "TEST"), nil, time.Second) 5177 require_NoError(t, err) 5178 c.waitOnStreamLeader(globalAccountName, "TEST") 5179 if nl == c.streamLeader(globalAccountName, "TEST") { 5180 break 5181 } 5182 } 5183 5184 _, err = js.UpdateStream(&nats.StreamConfig{ 5185 Name: "TEST", 5186 Subjects: []string{"foo"}, 5187 Replicas: 1, 5188 }) 5189 require_NoError(t, err) 5190 5191 // Make sure all in order. 5192 errCh := make(chan error, 100) 5193 var wg sync.WaitGroup 5194 wg.Add(1) 5195 5196 expected, seen := 2000, 0 5197 5198 sub, err := js.Subscribe("foo", func(msg *nats.Msg) { 5199 expectedID := fmt.Sprintf("ID:%d", seen) 5200 if v := msg.Header.Get(JSMsgId); v != expectedID { 5201 errCh <- err 5202 wg.Done() 5203 msg.Sub.Unsubscribe() 5204 return 5205 } 5206 seen++ 5207 if seen >= expected { 5208 wg.Done() 5209 msg.Sub.Unsubscribe() 5210 } 5211 }) 5212 require_NoError(t, err) 5213 defer sub.Unsubscribe() 5214 5215 wg.Wait() 5216 if len(errCh) > 0 { 5217 t.Fatalf("Expected no errors, got %d", len(errCh)) 5218 } 5219 } 5220 5221 func TestJetStreamClusterOrphanConsumerSubjects(t *testing.T) { 5222 c := createJetStreamClusterExplicit(t, "R3S", 3) 5223 defer c.shutdown() 5224 5225 nc, js := jsClientConnect(t, c.randomServer()) 5226 defer nc.Close() 5227 5228 _, err := js.AddStream(&nats.StreamConfig{ 5229 Name: "TEST", 5230 Subjects: []string{"foo.>", "bar.>"}, 5231 Replicas: 3, 5232 }) 5233 require_NoError(t, err) 5234 5235 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 5236 Name: "consumer_foo", 5237 Durable: "consumer_foo", 5238 FilterSubject: "foo.something", 5239 }) 5240 require_NoError(t, err) 5241 5242 for _, replicas := range []int{3, 1, 3} { 5243 _, err = js.UpdateStream(&nats.StreamConfig{ 5244 Name: "TEST", 5245 Subjects: []string{"bar.>"}, 5246 Replicas: replicas, 5247 }) 5248 require_NoError(t, err) 5249 c.waitOnAllCurrent() 5250 } 5251 5252 c.waitOnStreamLeader("$G", "TEST") 5253 c.waitOnConsumerLeader("$G", "TEST", "consumer_foo") 5254 5255 info, err := js.ConsumerInfo("TEST", "consumer_foo") 5256 require_NoError(t, err) 5257 require_True(t, info.Cluster != nil) 5258 require_NotEqual(t, info.Cluster.Leader, "") 5259 require_Equal(t, len(info.Cluster.Replicas), 2) 5260 } 5261 5262 func TestJetStreamClusterDurableConsumerInactiveThresholdLeaderSwitch(t *testing.T) { 5263 c := createJetStreamClusterExplicit(t, "R3S", 3) 5264 defer c.shutdown() 5265 5266 nc, js := jsClientConnect(t, c.randomServer()) 5267 defer nc.Close() 5268 5269 _, err := js.AddStream(&nats.StreamConfig{ 5270 Name: "TEST", 5271 Subjects: []string{"*"}, 5272 Replicas: 3, 5273 }) 5274 require_NoError(t, err) 5275 5276 // Queue a msg. 
5277 sendStreamMsg(t, nc, "foo", "ok") 5278 5279 thresh := 250 * time.Millisecond 5280 5281 // This will start the timer. 5282 sub, err := js.PullSubscribe("foo", "dlc", nats.InactiveThreshold(thresh)) 5283 require_NoError(t, err) 5284 5285 // Switch over leader. 5286 cl := c.consumerLeader(globalAccountName, "TEST", "dlc") 5287 cl.JetStreamStepdownConsumer(globalAccountName, "TEST", "dlc") 5288 c.waitOnConsumerLeader(globalAccountName, "TEST", "dlc") 5289 5290 // Create activity on this consumer. 5291 msgs, err := sub.Fetch(1) 5292 require_NoError(t, err) 5293 require_True(t, len(msgs) == 1) 5294 5295 // This is considered activity as well, so we can now watch for up to thresh to make sure the consumer stays active. 5296 msgs[0].AckSync() 5297 5298 // The consumer should not disappear during the next `thresh` interval unless the old leader wrongly deletes it. 5299 timeout := time.Now().Add(thresh) 5300 for time.Now().Before(timeout) { 5301 _, err := js.ConsumerInfo("TEST", "dlc") 5302 if err == nats.ErrConsumerNotFound { 5303 t.Fatalf("Consumer deleted when it should not have been") 5304 } 5305 } 5306 } 5307 5308 func TestJetStreamClusterConsumerMaxDeliveryNumAckPendingBug(t *testing.T) { 5309 c := createJetStreamClusterExplicit(t, "R3S", 3) 5310 defer c.shutdown() 5311 5312 nc, js := jsClientConnect(t, c.randomServer()) 5313 defer nc.Close() 5314 5315 _, err := js.AddStream(&nats.StreamConfig{ 5316 Name: "TEST", 5317 Subjects: []string{"*"}, 5318 Replicas: 3, 5319 }) 5320 require_NoError(t, err) 5321 5322 // send 50 msgs 5323 for i := 0; i < 50; i++ { 5324 _, err := js.Publish("foo", []byte("ok")) 5325 require_NoError(t, err) 5326 } 5327 5328 // File based. 5329 _, err = js.Subscribe("foo", 5330 func(msg *nats.Msg) {}, 5331 nats.Durable("file"), 5332 nats.ManualAck(), 5333 nats.MaxDeliver(1), 5334 nats.AckWait(time.Second), 5335 nats.MaxAckPending(10), 5336 ) 5337 require_NoError(t, err) 5338 5339 // Let first batch retry and expire. 5340 time.Sleep(1200 * time.Millisecond) 5341 5342 cia, err := js.ConsumerInfo("TEST", "file") 5343 require_NoError(t, err) 5344 5345 // Make sure followers will have exact same state. 5346 _, err = nc.Request(fmt.Sprintf(JSApiConsumerLeaderStepDownT, "TEST", "file"), nil, time.Second) 5347 require_NoError(t, err) 5348 c.waitOnConsumerLeader(globalAccountName, "TEST", "file") 5349 5350 cib, err := js.ConsumerInfo("TEST", "file") 5351 require_NoError(t, err) 5352 5353 // Want to compare sans cluster details which we know will change due to leader change. 5354 // Also last activity for delivered can be slightly off so nil out as well. 5355 checkConsumerInfo := func(a, b *nats.ConsumerInfo) { 5356 t.Helper() 5357 a.Cluster, b.Cluster = nil, nil 5358 a.Delivered.Last, b.Delivered.Last = nil, nil 5359 if !reflect.DeepEqual(a, b) { 5360 t.Fatalf("ConsumerInfo do not match\n\t%+v\n\t%+v", a, b) 5361 } 5362 } 5363 5364 checkConsumerInfo(cia, cib) 5365 5366 // Memory based. 5367 _, err = js.Subscribe("foo", 5368 func(msg *nats.Msg) {}, 5369 nats.Durable("mem"), 5370 nats.ManualAck(), 5371 nats.MaxDeliver(1), 5372 nats.AckWait(time.Second), 5373 nats.MaxAckPending(10), 5374 nats.ConsumerMemoryStorage(), 5375 ) 5376 require_NoError(t, err) 5377 5378 // Let first batch retry and expire. 5379 time.Sleep(1200 * time.Millisecond) 5380 5381 cia, err = js.ConsumerInfo("TEST", "mem") 5382 require_NoError(t, err) 5383 5384 // Make sure followers will have exact same state.
5385 _, err = nc.Request(fmt.Sprintf(JSApiConsumerLeaderStepDownT, "TEST", "mem"), nil, time.Second) 5386 require_NoError(t, err) 5387 c.waitOnConsumerLeader(globalAccountName, "TEST", "mem") 5388 5389 cib, err = js.ConsumerInfo("TEST", "mem") 5390 require_NoError(t, err) 5391 5392 checkConsumerInfo(cia, cib) 5393 5394 // Now file based but R1 and server restart. 5395 _, err = js.Subscribe("foo", 5396 func(msg *nats.Msg) {}, 5397 nats.Durable("r1"), 5398 nats.ManualAck(), 5399 nats.MaxDeliver(1), 5400 nats.AckWait(time.Second), 5401 nats.MaxAckPending(10), 5402 nats.ConsumerReplicas(1), 5403 ) 5404 require_NoError(t, err) 5405 5406 // Let first batch retry and expire. 5407 time.Sleep(1200 * time.Millisecond) 5408 5409 cia, err = js.ConsumerInfo("TEST", "r1") 5410 require_NoError(t, err) 5411 5412 cl := c.consumerLeader(globalAccountName, "TEST", "r1") 5413 cl.Shutdown() 5414 cl.WaitForShutdown() 5415 cl = c.restartServer(cl) 5416 c.waitOnServerCurrent(cl) 5417 5418 cib, err = js.ConsumerInfo("TEST", "r1") 5419 require_NoError(t, err) 5420 5421 // Created can skew a small bit due to server restart, this is expected. 5422 now := time.Now() 5423 cia.Created, cib.Created = now, now 5424 // Clear any disagreement on push bound. 5425 cia.PushBound, cib.PushBound = false, false 5426 checkConsumerInfo(cia, cib) 5427 } 5428 5429 func TestJetStreamClusterConsumerDefaultsFromStream(t *testing.T) { 5430 c := createJetStreamClusterExplicit(t, "R3S", 3) 5431 defer c.shutdown() 5432 5433 nc, js := jsClientConnect(t, c.randomServer()) 5434 defer nc.Close() 5435 5436 streamTmpl := &StreamConfig{ 5437 Name: "test", 5438 Subjects: []string{"test.*"}, 5439 Storage: MemoryStorage, 5440 ConsumerLimits: StreamConsumerLimits{ 5441 MaxAckPending: 0, 5442 InactiveThreshold: 0, 5443 }, 5444 } 5445 5446 // Since nats.go doesn't yet know about the consumer limits, craft 5447 // the stream configuration request by hand. 
5448 streamCreate := func(maxAckPending int, inactiveThreshold time.Duration) (*StreamConfig, error) { 5449 cfg := streamTmpl 5450 cfg.ConsumerLimits = StreamConsumerLimits{ 5451 MaxAckPending: maxAckPending, 5452 InactiveThreshold: inactiveThreshold, 5453 } 5454 j, err := json.Marshal(cfg) 5455 if err != nil { 5456 return nil, err 5457 } 5458 msg, err := nc.Request(fmt.Sprintf(JSApiStreamCreateT, "test"), j, time.Second*3) 5459 if err != nil { 5460 return nil, err 5461 } 5462 var resp JSApiStreamCreateResponse 5463 if err := json.Unmarshal(msg.Data, &resp); err != nil { 5464 return nil, err 5465 } 5466 if resp.StreamInfo == nil { 5467 return nil, resp.ApiResponse.ToError() 5468 } 5469 return &resp.Config, resp.ApiResponse.ToError() 5470 } 5471 streamUpdate := func(maxAckPending int, inactiveThreshold time.Duration) (*StreamConfig, error) { 5472 cfg := streamTmpl 5473 cfg.ConsumerLimits = StreamConsumerLimits{ 5474 MaxAckPending: maxAckPending, 5475 InactiveThreshold: inactiveThreshold, 5476 } 5477 j, err := json.Marshal(cfg) 5478 if err != nil { 5479 return nil, err 5480 } 5481 msg, err := nc.Request(fmt.Sprintf(JSApiStreamUpdateT, "test"), j, time.Second*3) 5482 if err != nil { 5483 return nil, err 5484 } 5485 var resp JSApiStreamUpdateResponse 5486 if err := json.Unmarshal(msg.Data, &resp); err != nil { 5487 return nil, err 5488 } 5489 if resp.StreamInfo == nil { 5490 return nil, resp.ApiResponse.ToError() 5491 } 5492 return &resp.Config, resp.ApiResponse.ToError() 5493 } 5494 5495 if _, err := streamCreate(15, time.Second); err != nil { 5496 t.Fatalf("Failed to add stream: %v", err) 5497 } 5498 5499 t.Run("InheritDefaultsFromStream", func(t *testing.T) { 5500 ci, err := js.AddConsumer("test", &nats.ConsumerConfig{ 5501 Name: "InheritDefaultsFromStream", 5502 }) 5503 require_NoError(t, err) 5504 5505 switch { 5506 case ci.Config.InactiveThreshold != time.Second: 5507 t.Fatalf("InactiveThreshold should be 1s, got %s", ci.Config.InactiveThreshold) 5508 case ci.Config.MaxAckPending != 15: 5509 t.Fatalf("MaxAckPending should be 15, got %d", ci.Config.MaxAckPending) 5510 } 5511 }) 5512 5513 t.Run("CreateConsumerErrorOnExceedMaxAckPending", func(t *testing.T) { 5514 _, err := js.AddConsumer("test", &nats.ConsumerConfig{ 5515 Name: "CreateConsumerErrorOnExceedMaxAckPending", 5516 MaxAckPending: 30, 5517 }) 5518 switch e := err.(type) { 5519 case *nats.APIError: 5520 if ErrorIdentifier(e.ErrorCode) != JSConsumerMaxPendingAckExcessErrF { 5521 t.Fatalf("invalid error code, got %d, wanted %d", e.ErrorCode, JSConsumerMaxPendingAckExcessErrF) 5522 } 5523 default: 5524 t.Fatalf("should have returned API error, got %T", e) 5525 } 5526 }) 5527 5528 t.Run("CreateConsumerErrorOnExceedInactiveThreshold", func(t *testing.T) { 5529 _, err := js.AddConsumer("test", &nats.ConsumerConfig{ 5530 Name: "CreateConsumerErrorOnExceedInactiveThreshold", 5531 InactiveThreshold: time.Second * 2, 5532 }) 5533 switch e := err.(type) { 5534 case *nats.APIError: 5535 if ErrorIdentifier(e.ErrorCode) != JSConsumerInactiveThresholdExcess { 5536 t.Fatalf("invalid error code, got %d, wanted %d", e.ErrorCode, JSConsumerInactiveThresholdExcess) 5537 } 5538 default: 5539 t.Fatalf("should have returned API error, got %T", e) 5540 } 5541 }) 5542 5543 t.Run("UpdateStreamErrorOnViolateConsumerMaxAckPending", func(t *testing.T) { 5544 _, err := js.AddConsumer("test", &nats.ConsumerConfig{ 5545 Name: "UpdateStreamErrorOnViolateConsumerMaxAckPending", 5546 MaxAckPending: 15, 5547 }) 5548 require_NoError(t, err) 5549 5550 if _, err = 
streamUpdate(10, 0); err == nil { 5551 t.Fatalf("stream update should have errored but didn't") 5552 } 5553 }) 5554 5555 t.Run("UpdateStreamErrorOnViolateConsumerInactiveThreshold", func(t *testing.T) { 5556 _, err := js.AddConsumer("test", &nats.ConsumerConfig{ 5557 Name: "UpdateStreamErrorOnViolateConsumerInactiveThreshold", 5558 InactiveThreshold: time.Second, 5559 }) 5560 require_NoError(t, err) 5561 5562 if _, err = streamUpdate(0, time.Second/2); err == nil { 5563 t.Fatalf("stream update should have errored but didn't") 5564 } 5565 }) 5566 } 5567 5568 // Discovered that we are not properly setting certain default filestore blkSizes. 5569 func TestJetStreamClusterCheckFileStoreBlkSizes(t *testing.T) { 5570 c := createJetStreamClusterExplicit(t, "R3S", 3) 5571 defer c.shutdown() 5572 5573 nc, js := jsClientConnect(t, c.randomServer()) 5574 defer nc.Close() 5575 5576 // Normal Stream 5577 _, err := js.AddStream(&nats.StreamConfig{ 5578 Name: "TEST", 5579 Subjects: []string{"*"}, 5580 Replicas: 3, 5581 }) 5582 require_NoError(t, err) 5583 5584 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 5585 Durable: "C3", 5586 AckPolicy: nats.AckExplicitPolicy, 5587 }) 5588 require_NoError(t, err) 5589 5590 // KV 5591 _, err = js.CreateKeyValue(&nats.KeyValueConfig{ 5592 Bucket: "TEST", 5593 Replicas: 3, 5594 }) 5595 require_NoError(t, err) 5596 5597 blkSize := func(fs *fileStore) uint64 { 5598 fs.mu.RLock() 5599 defer fs.mu.RUnlock() 5600 return fs.fcfg.BlockSize 5601 } 5602 5603 // We will check now the following filestores. 5604 // meta 5605 // TEST stream and NRG 5606 // C3 NRG 5607 // KV_TEST stream and NRG 5608 for _, s := range c.servers { 5609 js, cc := s.getJetStreamCluster() 5610 // META 5611 js.mu.RLock() 5612 meta := cc.meta 5613 js.mu.RUnlock() 5614 require_True(t, meta != nil) 5615 fs := meta.(*raft).wal.(*fileStore) 5616 require_True(t, blkSize(fs) == defaultMetaFSBlkSize) 5617 5618 // TEST STREAM 5619 mset, err := s.GlobalAccount().lookupStream("TEST") 5620 require_NoError(t, err) 5621 mset.mu.RLock() 5622 fs = mset.store.(*fileStore) 5623 mset.mu.RUnlock() 5624 require_True(t, blkSize(fs) == defaultLargeBlockSize) 5625 5626 // KV STREAM 5627 // Now the KV which is different default size. 
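// KV buckets are backed by a stream named "KV_<bucket>", hence the "KV_TEST" lookup below.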
5628 kv, err := s.GlobalAccount().lookupStream("KV_TEST") 5629 require_NoError(t, err) 5630 kv.mu.RLock() 5631 fs = kv.store.(*fileStore) 5632 kv.mu.RUnlock() 5633 require_True(t, blkSize(fs) == defaultKVBlockSize) 5634 5635 // Now check NRGs 5636 // TEST Stream 5637 n := mset.raftNode() 5638 require_True(t, n != nil) 5639 fs = n.(*raft).wal.(*fileStore) 5640 require_True(t, blkSize(fs) == defaultMediumBlockSize) 5641 // KV TEST Stream 5642 n = kv.raftNode() 5643 require_True(t, n != nil) 5644 fs = n.(*raft).wal.(*fileStore) 5645 require_True(t, blkSize(fs) == defaultMediumBlockSize) 5646 // Consumer 5647 o := mset.lookupConsumer("C3") 5648 require_True(t, o != nil) 5649 n = o.raftNode() 5650 require_True(t, n != nil) 5651 fs = n.(*raft).wal.(*fileStore) 5652 require_True(t, blkSize(fs) == defaultMediumBlockSize) 5653 } 5654 } 5655 5656 func TestJetStreamClusterDetectOrphanNRGs(t *testing.T) { 5657 c := createJetStreamClusterExplicit(t, "R3S", 3) 5658 defer c.shutdown() 5659 5660 nc, js := jsClientConnect(t, c.randomServer()) 5661 defer nc.Close() 5662 5663 // Normal Stream 5664 _, err := js.AddStream(&nats.StreamConfig{ 5665 Name: "TEST", 5666 Subjects: []string{"*"}, 5667 Replicas: 3, 5668 }) 5669 require_NoError(t, err) 5670 5671 _, err = js.AddConsumer("TEST", &nats.ConsumerConfig{ 5672 Durable: "DC", 5673 AckPolicy: nats.AckExplicitPolicy, 5674 }) 5675 require_NoError(t, err) 5676 5677 // We will force an orphan for a certain server. 5678 s := c.randomNonStreamLeader(globalAccountName, "TEST") 5679 5680 mset, err := s.GlobalAccount().lookupStream("TEST") 5681 require_NoError(t, err) 5682 sgn := mset.raftNode().Group() 5683 mset.clearRaftNode() 5684 5685 o := mset.lookupConsumer("DC") 5686 require_True(t, o != nil) 5687 ogn := o.raftNode().Group() 5688 o.clearRaftNode() 5689 5690 require_NoError(t, js.DeleteStream("TEST")) 5691 5692 // Check that we do in fact have orphans. 5693 require_True(t, s.numRaftNodes() > 1) 5694 5695 // This function will detect orphans and clean them up. 5696 s.checkForNRGOrphans() 5697 5698 // Should only be meta NRG left. 
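// sgn and ogn are the Raft group names captured before clearRaftNode above, so lookups for
// them should now return nil.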
5699 require_True(t, s.numRaftNodes() == 1) 5700 require_True(t, s.lookupRaftNode(sgn) == nil) 5701 require_True(t, s.lookupRaftNode(ogn) == nil) 5702 } 5703 5704 func TestJetStreamClusterRestartThenScaleStreamReplicas(t *testing.T) { 5705 t.Skip("This test takes too long, need to make shorter") 5706 5707 c := createJetStreamClusterExplicit(t, "R3S", 3) 5708 defer c.shutdown() 5709 5710 s := c.randomNonLeader() 5711 nc, js := jsClientConnect(t, s) 5712 defer nc.Close() 5713 5714 nc2, producer := jsClientConnect(t, s) 5715 defer nc2.Close() 5716 5717 _, err := js.AddStream(&nats.StreamConfig{ 5718 Name: "TEST", 5719 Subjects: []string{"foo"}, 5720 Replicas: 3, 5721 }) 5722 require_NoError(t, err) 5723 c.waitOnStreamLeader(globalAccountName, "TEST") 5724 5725 ctx, cancel := context.WithCancel(context.Background()) 5726 defer cancel() 5727 5728 end := time.Now().Add(2 * time.Second) 5729 for time.Now().Before(end) { 5730 producer.Publish("foo", []byte(strings.Repeat("A", 128))) 5731 time.Sleep(time.Millisecond) 5732 } 5733 5734 var wg sync.WaitGroup 5735 for i := 0; i < 5; i++ { 5736 sub, err := js.PullSubscribe("foo", fmt.Sprintf("C-%d", i)) 5737 require_NoError(t, err) 5738 5739 wg.Add(1) 5740 go func() { 5741 defer wg.Done() 5742 for range time.NewTicker(10 * time.Millisecond).C { 5743 select { 5744 case <-ctx.Done(): 5745 return 5746 default: 5747 } 5748 5749 msgs, err := sub.Fetch(1) 5750 if err != nil && !errors.Is(err, nats.ErrTimeout) && !errors.Is(err, nats.ErrConnectionClosed) { 5751 t.Logf("Pull Error: %v", err) 5752 } 5753 for _, msg := range msgs { 5754 msg.Ack() 5755 } 5756 } 5757 }() 5758 } 5759 c.lameDuckRestartAll() 5760 c.waitOnStreamLeader(globalAccountName, "TEST") 5761 5762 // Swap the logger to try to detect the condition after the restart. 5763 loggers := make([]*captureDebugLogger, 3) 5764 for i, srv := range c.servers { 5765 l := &captureDebugLogger{dbgCh: make(chan string, 10)} 5766 loggers[i] = l 5767 srv.SetLogger(l, true, false) 5768 } 5769 condition := `Direct proposal ignored, not leader (state: CLOSED)` 5770 errCh := make(chan error, 10) 5771 5772 wg.Add(1) 5773 go func() { 5774 defer wg.Done() 5775 for { 5776 select { 5777 case dl := <-loggers[0].dbgCh: 5778 if strings.Contains(dl, condition) { 5779 errCh <- fmt.Errorf(condition) 5780 } 5781 case dl := <-loggers[1].dbgCh: 5782 if strings.Contains(dl, condition) { 5783 errCh <- fmt.Errorf(condition) 5784 } 5785 case dl := <-loggers[2].dbgCh: 5786 if strings.Contains(dl, condition) { 5787 errCh <- fmt.Errorf(condition) 5788 } 5789 case <-ctx.Done(): 5790 return 5791 } 5792 } 5793 }() 5794 5795 // Start publishing again for a while. 5796 end = time.Now().Add(2 * time.Second) 5797 for time.Now().Before(end) { 5798 producer.Publish("foo", []byte(strings.Repeat("A", 128))) 5799 time.Sleep(time.Millisecond) 5800 } 5801 5802 // Try to do a stream edit back to R=1 after doing all the upgrade. 5803 info, _ := js.StreamInfo("TEST") 5804 sconfig := info.Config 5805 sconfig.Replicas = 1 5806 _, err = js.UpdateStream(&sconfig) 5807 require_NoError(t, err) 5808 5809 // Leave running for some time after the update. 5810 time.Sleep(2 * time.Second) 5811 5812 info, _ = js.StreamInfo("TEST") 5813 sconfig = info.Config 5814 sconfig.Replicas = 3 5815 _, err = js.UpdateStream(&sconfig) 5816 require_NoError(t, err) 5817 5818 select { 5819 case e := <-errCh: 5820 t.Fatalf("Bad condition on raft node: %v", e) 5821 case <-time.After(2 * time.Second): 5822 // Done 5823 } 5824 5825 // Stop goroutines and wait for them to exit. 
	cancel()
	wg.Wait()
}

// https://github.com/nats-io/nats-server/issues/4732
func TestJetStreamClusterStreamLimitsOnScaleUpAndMove(t *testing.T) {
	tmpl := `
	listen: 127.0.0.1:-1
	server_name: %s
	jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'}
	cluster {
		name: %s
		listen: 127.0.0.1:%d
		routes = [%s]
	}
	`
	opFrag := `
	operator: %s
	system_account: %s
	resolver: { type: MEM }
	resolver_preload = {
		%s : %s
		%s : %s
	}
	`

	_, syspub := createKey(t)
	sysJwt := encodeClaim(t, jwt.NewAccountClaims(syspub), syspub)

	accKp, aExpPub := createKey(t)
	accClaim := jwt.NewAccountClaims(aExpPub)
	accClaim.Limits.JetStreamTieredLimits["R1"] = jwt.JetStreamLimits{
		DiskStorage: -1, Consumer: -1, Streams: 1}
	accClaim.Limits.JetStreamTieredLimits["R3"] = jwt.JetStreamLimits{
		DiskStorage: 0, Consumer: -1, Streams: 1}
	accJwt := encodeClaim(t, accClaim, aExpPub)
	accCreds := newUser(t, accKp)

	template := tmpl + fmt.Sprintf(opFrag, ojwt, syspub, syspub, sysJwt, aExpPub, accJwt)

	c := createJetStreamCluster(t, template, "CLOUD", _EMPTY_, 3, 22020, true)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer(), nats.UserCredentials(accCreds))
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:     "TEST",
		Subjects: []string{"foo"},
	})
	require_NoError(t, err)

	toSend, msg := 100, bytes.Repeat([]byte("Z"), 1024)
	for i := 0; i < toSend; i++ {
		_, err := js.PublishAsync("foo", msg)
		require_NoError(t, err)
	}
	select {
	case <-js.PublishAsyncComplete():
	case <-time.After(5 * time.Second):
		t.Fatalf("Did not receive completion signal")
	}

	// Scale up should fail here since there is no R3 storage allowance.
	_, err = js.UpdateStream(&nats.StreamConfig{
		Name:     "TEST",
		Subjects: []string{"foo"},
		Replicas: 3,
	})
	require_Error(t, err, errors.New("insufficient storage resources"))
}

func TestJetStreamClusterAPIAccessViaSystemAccount(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "R3S", 3)
	defer c.shutdown()

	// Connect to system account.
	nc, js := jsClientConnect(t, c.randomServer(), nats.UserInfo("admin", "s3cr3t!"))
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{Name: "TEST"})
	require_Error(t, err, NewJSNotEnabledForAccountError())

	// Make sure we get the same behavior with a single server.
	tmpl := `
		listen: 127.0.0.1:-1
		jetstream: {max_mem_store: 256MB, max_file_store: 2GB, store_dir: '%s'}
		accounts { $SYS { users = [ { user: "admin", pass: "s3cr3t!" } ] } }
	`
	conf := createConfFile(t, []byte(fmt.Sprintf(tmpl, t.TempDir())))
	s, _ := RunServerWithConfig(conf)
	defer s.Shutdown()

	nc, js = jsClientConnect(t, s, nats.UserInfo("admin", "s3cr3t!"))
	defer nc.Close()

	_, err = js.AddStream(&nats.StreamConfig{Name: "TEST"})
	require_Error(t, err, NewJSNotEnabledForAccountError())
}

func TestJetStreamClusterStreamResetPreacks(t *testing.T) {
	c := createJetStreamClusterExplicit(t, "R3S", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:      "TEST",
		Subjects:  []string{"foo"},
		Retention: nats.InterestPolicy,
		Replicas:  3,
	})
	require_NoError(t, err)

	err = js.PurgeStream("TEST", &nats.StreamPurgeRequest{Sequence: 100_000_000})
	require_NoError(t, err)

	sub, err := js.PullSubscribe("foo", "dlc")
	require_NoError(t, err)

	// Put 20 msgs in.
	for i := 0; i < 20; i++ {
		_, err := js.Publish("foo", nil)
		require_NoError(t, err)
	}

	// Consume and ack 10.
	msgs, err := sub.Fetch(10, nats.MaxWait(time.Second))
	require_NoError(t, err)
	require_Equal(t, len(msgs), 10)

	for _, msg := range msgs {
		msg.AckSync()
	}

	// Now grab a non-leader server.
	// We will shut it down and remove the stream data.
	nl := c.randomNonStreamLeader(globalAccountName, "TEST")
	mset, err := nl.GlobalAccount().lookupStream("TEST")
	require_NoError(t, err)
	fs := mset.store.(*fileStore)
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	nl.Shutdown()
	// In case that was the consumer leader.
	c.waitOnConsumerLeader(globalAccountName, "TEST", "dlc")

	// Now consume the remaining 10 and ack.
	msgs, err = sub.Fetch(10, nats.MaxWait(10*time.Second))
	require_NoError(t, err)
	require_Equal(t, len(msgs), 10)

	for _, msg := range msgs {
		msg.AckSync()
	}

	// Now remove the stream's message data manually.
	require_NoError(t, os.RemoveAll(mdir))
	nl = c.restartServer(nl)
	c.waitOnAllCurrent()

	mset, err = nl.GlobalAccount().lookupStream("TEST")
	require_NoError(t, err)

	checkFor(t, 10*time.Second, 200*time.Millisecond, func() error {
		state := mset.state()
		if state.Msgs != 0 || state.FirstSeq != 100_000_020 {
			return fmt.Errorf("Not correct state yet: %+v", state)
		}
		return nil
	})
}

func TestJetStreamClusterDomainAdvisory(t *testing.T) {
	tmpl := strings.Replace(jsClusterAccountsTempl, "store_dir:", "domain: NGS, store_dir:", 1)
	c := createJetStreamCluster(t, tmpl, "R3S", _EMPTY_, 3, 18033, true)
	defer c.shutdown()

	// Connect to system account.
	nc, _ := jsClientConnect(t, c.randomServer(), nats.UserInfo("admin", "s3cr3t!"))
	defer nc.Close()

	sub, err := nc.SubscribeSync(JSAdvisoryDomainLeaderElected)
	require_NoError(t, err)

	// Ask meta leader to move and make sure we get an advisory.
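	// JSApiLeaderStepDown addresses the metadata leader; once a new meta
	// leader is elected, a domain leader-elected advisory should be published,
	// which is what the subscription above is waiting for.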
	nc.Request(JSApiLeaderStepDown, nil, time.Second)
	c.waitOnLeader()

	checkSubsPending(t, sub, 1)

	m, err := sub.NextMsg(time.Second)
	require_NoError(t, err)

	var adv JSDomainLeaderElectedAdvisory
	require_NoError(t, json.Unmarshal(m.Data, &adv))

	ml := c.leader()
	js, cc := ml.getJetStreamCluster()
	js.mu.RLock()
	peer := cc.meta.ID()
	js.mu.RUnlock()

	require_Equal(t, adv.Leader, peer)
	require_Equal(t, adv.Domain, "NGS")
	require_Equal(t, adv.Cluster, "R3S")
	require_Equal(t, len(adv.Replicas), 3)
}

func TestJetStreamClusterLimitsBasedStreamFileStoreDesync(t *testing.T) {
	conf := `
	listen: 127.0.0.1:-1
	server_name: %s
	jetstream: {
		store_dir: '%s',
	}
	cluster {
		name: %s
		listen: 127.0.0.1:%d
		routes = [%s]
	}
	system_account: sys
	no_auth_user: js
	accounts {
		sys {
			users = [
				{ user: sys, pass: sys }
			]
		}
		js {
			jetstream = { store_max_stream_bytes = 3mb }
			users = [
				{ user: js, pass: js }
			]
		}
	}`
	c := createJetStreamClusterWithTemplate(t, conf, "limits", 3)
	defer c.shutdown()

	nc, js := jsClientConnect(t, c.randomServer())
	defer nc.Close()

	cnc, cjs := jsClientConnect(t, c.randomServer())
	defer cnc.Close()

	_, err := js.AddStream(&nats.StreamConfig{
		Name:     "LTEST",
		Subjects: []string{"messages.*"},
		Replicas: 3,
		MaxAge:   10 * time.Minute,
		MaxMsgs:  100_000,
	})
	require_NoError(t, err)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	psub, err := cjs.PullSubscribe("messages.*", "consumer")
	require_NoError(t, err)

	var (
		wg          sync.WaitGroup
		received    uint64
		errCh       = make(chan error, 100_000)
		receivedMap = make(map[string]*nats.Msg)
	)
	wg.Add(1)
	go func() {
		tick := time.NewTicker(20 * time.Millisecond)
		for {
			select {
			case <-ctx.Done():
				wg.Done()
				return
			case <-tick.C:
				msgs, err := psub.Fetch(10, nats.MaxWait(200*time.Millisecond))
				if err != nil {
					continue
				}
				for _, msg := range msgs {
					received++
					receivedMap[msg.Subject] = msg
					if meta, _ := msg.Metadata(); meta.NumDelivered > 1 {
						t.Logf("GOT MSG: %s :: %+v :: %d", msg.Subject, meta, len(msg.Data))
					}
					msg.Ack()
				}
			}
		}
	}()

	// Publish at roughly 1 msg per msec until the error collector below
	// cancels the context.
	shouldDrop := make(map[string]error)
	wg.Add(1)
	go func() {
		payload := []byte(strings.Repeat("A", 1024))
		tick := time.NewTicker(1 * time.Millisecond)
		for i := 1; i < 100_000; {
			select {
			case <-ctx.Done():
				wg.Done()
				return
			case <-tick.C:
				// This should run into the 3MB quota and start erroring right
				// away, before the max msgs limit is reached.
				subject := fmt.Sprintf("messages.%d", i)
				_, err := js.Publish(subject, payload, nats.RetryAttempts(0))
				if err != nil {
					errCh <- err
				}
				i++

				// Any message past this count should fail since the stream
				// should already be full due to the quota; record the subjects
				// whose publishes were not confirmed.
				if err != nil && i > 1000 {
					shouldDrop[subject] = err
				}
			}
		}
	}()

	// Collect enough errors to cause things to get out of sync.
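	// The publisher keeps writing into a full stream, so publish errors pile
	// up on errCh; once 20_000 of them have been seen the context is
	// cancelled, stopping both the producer and consumer goroutines.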
	var errCount int
Setup:
	for {
		select {
		case err = <-errCh:
			errCount++
			if errCount >= 20_000 {
				// Stop both producing and consuming.
				cancel()
				break Setup
			}
		case <-time.After(5 * time.Second):
			t.Fatalf("Timed out waiting for limits error")
		}
	}

	// Both goroutines should be exiting now.
	wg.Wait()

	// Check messages that ought to have been dropped.
	for subject := range receivedMap {
		found, ok := shouldDrop[subject]
		if ok {
			t.Errorf("Should have dropped message published on %q since got error: %v", subject, found)
		}
	}

	getStreamDetails := func(t *testing.T, srv *Server) *StreamDetail {
		t.Helper()
		jsz, err := srv.Jsz(&JSzOptions{Accounts: true, Streams: true, Consumer: true})
		require_NoError(t, err)
		if len(jsz.AccountDetails) > 0 && len(jsz.AccountDetails[0].Streams) > 0 {
			details := jsz.AccountDetails[0]
			stream := details.Streams[0]
			return &stream
		}
		t.Error("Could not find account details")
		return nil
	}
	checkState := func(t *testing.T) error {
		t.Helper()

		leaderSrv := c.streamLeader("js", "LTEST")
		streamLeader := getStreamDetails(t, leaderSrv)
		// t.Logf("Stream Leader: %+v", streamLeader.State)
		errs := make([]error, 0)
		for _, srv := range c.servers {
			if srv == leaderSrv {
				// Skip self
				continue
			}
			stream := getStreamDetails(t, srv)
			if stream.State.Msgs != streamLeader.State.Msgs {
				err := fmt.Errorf("Leader %v has %d messages, Follower %v has %d messages",
					stream.Cluster.Leader, streamLeader.State.Msgs,
					srv.Name(), stream.State.Msgs,
				)
				errs = append(errs, err)
			}
		}
		if len(errs) > 0 {
			return errors.Join(errs...)
		}
		return nil
	}

	// Confirm state of the leader.
	leaderSrv := c.streamLeader("js", "LTEST")
	streamLeader := getStreamDetails(t, leaderSrv)
	if streamLeader.State.Msgs != received {
		t.Errorf("Leader %v has %d messages stored but %d messages were received (delta: %d)",
			leaderSrv.Name(), streamLeader.State.Msgs, received, received-streamLeader.State.Msgs)
	}
	cinfo, err := psub.ConsumerInfo()
	require_NoError(t, err)
	if received != cinfo.Delivered.Consumer {
		t.Errorf("Unexpected consumer sequence. Got: %v, expected: %v",
			cinfo.Delivered.Consumer, received)
	}

	// Check whether there was a drift between the leader and the followers.
	var (
		lastErr  error
		attempts int
	)
Check:
	for range time.NewTicker(1 * time.Second).C {
		lastErr = checkState(t)
		if attempts > 5 {
			break Check
		}
		attempts++
	}

	// Read the stream with a second consumer.
	psub2, err := cjs.PullSubscribe("messages.*", "")
	require_NoError(t, err)

Consume2:
	for {
		msgs, err := psub2.Fetch(100)
		if err != nil {
			continue
		}
		for _, msg := range msgs {
			msg.Ack()

			meta, _ := msg.Metadata()
			if meta.NumPending == 0 {
				break Consume2
			}
		}
	}

	cinfo2, err := psub2.ConsumerInfo()
	require_NoError(t, err)

	a := cinfo.Delivered.Consumer
	b := cinfo2.Delivered.Consumer
	if a != b {
		t.Errorf("Consumers to same stream are at different sequences: %d vs %d", a, b)
	}

	// If the replicas stayed in sync, the test is done and we can stop here.
	if lastErr == nil {
		return
	}

	// Now cause a few step downs while out of sync to surface differing results.
	t.Errorf("Replicas are out of sync:\n%v", lastErr)

	stepDown := func() {
		_, err = nc.Request(fmt.Sprintf(JSApiStreamLeaderStepDownT, "LTEST"), nil, time.Second)
	}
	// Check StreamInfo in this state, then trigger a few step downs.
	var prevLeaderMsgs uint64
	leaderSrv = c.streamLeader("js", "LTEST")
	sinfo, err := js.StreamInfo("LTEST")
	prevLeaderMsgs = sinfo.State.Msgs
	for i := 0; i < 10; i++ {
		stepDown()
		time.Sleep(2 * time.Second)

		leaderSrv = c.streamLeader("js", "LTEST")
		sinfo, err = js.StreamInfo("LTEST")
		if err != nil {
			t.Logf("Error: %v", err)
			continue
		}
		if leaderSrv != nil && sinfo != nil {
			t.Logf("When leader is %v, Messages: %d", leaderSrv.Name(), sinfo.State.Msgs)

			// Stop once the leader is the out-of-sync replica with fewer messages.
			if prevLeaderMsgs > sinfo.State.Msgs {
				break
			}
		}
	}
	t.Logf("Changed to use leader %v which has %d messages", leaderSrv.Name(), sinfo.State.Msgs)

	// Read the stream again with a third consumer.
	psub3, err := cjs.PullSubscribe("messages.*", "")
	require_NoError(t, err)

Consume3:
	for {
		msgs, err := psub3.Fetch(100)
		if err != nil {
			continue
		}
		for _, msg := range msgs {
			msg.Ack()

			meta, _ := msg.Metadata()
			if meta.NumPending == 0 {
				break Consume3
			}
		}
	}

	cinfo3, err := psub3.ConsumerInfo()
	require_NoError(t, err)

	// Compare the consumer created before the resource limits errors
	// with the one created before the step downs.
	a = cinfo.Delivered.Consumer
	b = cinfo2.Delivered.Consumer
	if a != b {
		t.Errorf("Consumers to same stream are at different sequences: %d vs %d", a, b)
	}

	// Compare the consumer created before the resource limits errors
	// with the one created AFTER the step downs.
	a = cinfo.Delivered.Consumer
	b = cinfo3.Delivered.Consumer
	if a != b {
		t.Errorf("Consumers to same stream are at different sequences: %d vs %d", a, b)
	}

	// Compare the consumers created after the resource limits errors.
	a = cinfo2.Delivered.Consumer
	b = cinfo3.Delivered.Consumer
	if a != b {
		t.Errorf("Consumers to same stream are at different sequences: %d vs %d", a, b)
	}
}
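// drainPullSubscription is a minimal sketch of the drain pattern used in the
// desync test above (the Consume2/Consume3 loops): keep fetching and acking
// until the consumer metadata reports no pending messages. The helper name and
// shape are illustrative only and are not used by any test in this file; it
// assumes a pull subscription from nats.go and short fetch waits, and returns
// the delivered count so callers can compare it the way the test compares
// Delivered.Consumer across consumers.
func drainPullSubscription(t *testing.T, sub *nats.Subscription) uint64 {
	t.Helper()
	var delivered uint64
	for {
		msgs, err := sub.Fetch(100, nats.MaxWait(time.Second))
		if err != nil {
			// A timeout just means nothing was immediately available; retry.
			if errors.Is(err, nats.ErrTimeout) {
				continue
			}
			t.Fatalf("Unexpected fetch error: %v", err)
		}
		for _, msg := range msgs {
			msg.Ack()
			delivered++
			if meta, _ := msg.Metadata(); meta != nil && meta.NumPending == 0 {
				return delivered
			}
		}
	}
}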