get.pme.sh/pnats@v0.0.0-20240304004023-26bb5a137ed0/server/jetstream_cluster.go (about) 1 // Copyright 2020-2024 The NATS Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package server 15 16 import ( 17 "bytes" 18 crand "crypto/rand" 19 "encoding/binary" 20 "encoding/json" 21 "errors" 22 "fmt" 23 "math" 24 "math/rand" 25 "os" 26 "path/filepath" 27 "reflect" 28 "sort" 29 "strings" 30 "sync/atomic" 31 "time" 32 33 "github.com/klauspost/compress/s2" 34 "github.com/minio/highwayhash" 35 "github.com/nats-io/nuid" 36 ) 37 38 // jetStreamCluster holds information about the meta group and stream assignments. 39 type jetStreamCluster struct { 40 // The metacontroller raftNode. 41 meta RaftNode 42 // For stream and consumer assignments. All servers will have this be the same. 43 // ACCOUNT -> STREAM -> Stream Assignment -> Consumers 44 streams map[string]map[string]*streamAssignment 45 // These are inflight proposals and used to apply limits when there are 46 // concurrent requests that would otherwise be accepted. 47 // We also record the group for the stream. This is needed since if we have 48 // concurrent requests for same account and stream we need to let it process to get 49 // a response but they need to be same group, peers etc. 50 inflight map[string]map[string]*raftGroup 51 // Signals meta-leader should check the stream assignments. 52 streamsCheck bool 53 // Server. 54 s *Server 55 // Internal client. 56 c *client 57 // Processing assignment results. 58 streamResults *subscription 59 consumerResults *subscription 60 // System level request to have the leader stepdown. 61 stepdown *subscription 62 // System level requests to remove a peer. 63 peerRemove *subscription 64 // System level request to move a stream 65 peerStreamMove *subscription 66 // System level request to cancel a stream move 67 peerStreamCancelMove *subscription 68 // To pop out the monitorCluster before the raft layer. 69 qch chan struct{} 70 } 71 72 // Used to guide placement of streams and meta controllers in clustered JetStream. 73 type Placement struct { 74 Cluster string `json:"cluster,omitempty"` 75 Tags []string `json:"tags,omitempty"` 76 } 77 78 // Define types of the entry. 79 type entryOp uint8 80 81 // ONLY ADD TO THE END, DO NOT INSERT IN BETWEEN WILL BREAK SERVER INTEROP. 82 const ( 83 // Meta ops. 84 assignStreamOp entryOp = iota 85 assignConsumerOp 86 removeStreamOp 87 removeConsumerOp 88 // Stream ops. 89 streamMsgOp 90 purgeStreamOp 91 deleteMsgOp 92 // Consumer ops. 93 updateDeliveredOp 94 updateAcksOp 95 // Compressed consumer assignments. 96 assignCompressedConsumerOp 97 // Filtered Consumer skip. 98 updateSkipOp 99 // Update Stream. 100 updateStreamOp 101 // For updating information on pending pull requests. 102 addPendingRequest 103 removePendingRequest 104 // For sending compressed streams, either through RAFT or catchup. 105 compressedStreamMsgOp 106 // For sending deleted gaps on catchups for replicas. 
107 deleteRangeOp 108 ) 109 110 // raftGroups are controlled by the metagroup controller. 111 // The raftGroups will house streams and consumers. 112 type raftGroup struct { 113 Name string `json:"name"` 114 Peers []string `json:"peers"` 115 Storage StorageType `json:"store"` 116 Cluster string `json:"cluster,omitempty"` 117 Preferred string `json:"preferred,omitempty"` 118 // Internal 119 node RaftNode 120 } 121 122 // streamAssignment is what the meta controller uses to assign streams to peers. 123 type streamAssignment struct { 124 Client *ClientInfo `json:"client,omitempty"` 125 Created time.Time `json:"created"` 126 Config *StreamConfig `json:"stream"` 127 Group *raftGroup `json:"group"` 128 Sync string `json:"sync"` 129 Subject string `json:"subject"` 130 Reply string `json:"reply"` 131 Restore *StreamState `json:"restore_state,omitempty"` 132 // Internal 133 consumers map[string]*consumerAssignment 134 responded bool 135 recovering bool 136 err error 137 } 138 139 // consumerAssignment is what the meta controller uses to assign consumers to streams. 140 type consumerAssignment struct { 141 Client *ClientInfo `json:"client,omitempty"` 142 Created time.Time `json:"created"` 143 Name string `json:"name"` 144 Stream string `json:"stream"` 145 Config *ConsumerConfig `json:"consumer"` 146 Group *raftGroup `json:"group"` 147 Subject string `json:"subject"` 148 Reply string `json:"reply"` 149 State *ConsumerState `json:"state,omitempty"` 150 // Internal 151 responded bool 152 recovering bool 153 deleted bool 154 err error 155 } 156 157 // streamPurge is what the stream leader will replicate when purging a stream. 158 type streamPurge struct { 159 Client *ClientInfo `json:"client,omitempty"` 160 Stream string `json:"stream"` 161 LastSeq uint64 `json:"last_seq"` 162 Subject string `json:"subject"` 163 Reply string `json:"reply"` 164 Request *JSApiStreamPurgeRequest `json:"request,omitempty"` 165 } 166 167 // streamMsgDelete is what the stream leader will replicate when deleting a message. 168 type streamMsgDelete struct { 169 Client *ClientInfo `json:"client,omitempty"` 170 Stream string `json:"stream"` 171 Seq uint64 `json:"seq"` 172 NoErase bool `json:"no_erase,omitempty"` 173 Subject string `json:"subject"` 174 Reply string `json:"reply"` 175 } 176 177 const ( 178 defaultStoreDirName = "_js_" 179 defaultMetaGroupName = "_meta_" 180 defaultMetaFSBlkSize = 1024 * 1024 181 jsExcludePlacement = "!jetstream" 182 ) 183 184 // Returns information useful in mixed mode. 185 func (s *Server) trackedJetStreamServers() (js, total int) { 186 s.mu.RLock() 187 defer s.mu.RUnlock() 188 if !s.isRunning() || !s.eventsEnabled() { 189 return -1, -1 190 } 191 s.nodeToInfo.Range(func(k, v interface{}) bool { 192 si := v.(nodeInfo) 193 if si.js { 194 js++ 195 } 196 total++ 197 return true 198 }) 199 return js, total 200 } 201 202 func (s *Server) getJetStreamCluster() (*jetStream, *jetStreamCluster) { 203 if s.isShuttingDown() { 204 return nil, nil 205 } 206 207 js := s.getJetStream() 208 if js == nil { 209 return nil, nil 210 } 211 212 // Only set once, do not need a lock. 
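// A note on how meta proposals are framed (sketch): applyMetaEntries further down
// switches on entryOp(buf[0]) and hands buf[1:] to the matching decoder, so every
// assignment proposal written through cc.meta.Propose is a single op byte from the
// entryOp list above followed by its (possibly compressed) payload, e.g.
//
//	buf := append([]byte{byte(assignStreamOp)}, payload...)
//	cc.meta.Propose(buf)
//
// The concrete encoders (encodeAddStreamAssignment and friends, used by
// processAddPeer and removePeerFromStreamLocked below) are defined later in this
// file; the layout above is inferred from the decode side and shown only for
// illustration.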
213 return js, js.cluster 214 } 215 216 func (s *Server) JetStreamIsClustered() bool { 217 js := s.getJetStream() 218 if js == nil { 219 return false 220 } 221 return js.isClustered() 222 } 223 224 func (s *Server) JetStreamIsLeader() bool { 225 return s.isMetaLeader.Load() 226 } 227 228 func (s *Server) JetStreamIsCurrent() bool { 229 js := s.getJetStream() 230 if js == nil { 231 return false 232 } 233 // Grab what we need and release js lock. 234 js.mu.RLock() 235 var meta RaftNode 236 cc := js.cluster 237 if cc != nil { 238 meta = cc.meta 239 } 240 js.mu.RUnlock() 241 242 if cc == nil { 243 // Non-clustered mode 244 return true 245 } 246 return meta.Current() 247 } 248 249 func (s *Server) JetStreamSnapshotMeta() error { 250 js := s.getJetStream() 251 if js == nil { 252 return NewJSNotEnabledError() 253 } 254 js.mu.RLock() 255 cc := js.cluster 256 isLeader := cc.isLeader() 257 meta := cc.meta 258 js.mu.RUnlock() 259 260 if !isLeader { 261 return errNotLeader 262 } 263 264 return meta.InstallSnapshot(js.metaSnapshot()) 265 } 266 267 func (s *Server) JetStreamStepdownStream(account, stream string) error { 268 js, cc := s.getJetStreamCluster() 269 if js == nil { 270 return NewJSNotEnabledError() 271 } 272 if cc == nil { 273 return NewJSClusterNotActiveError() 274 } 275 // Grab account 276 acc, err := s.LookupAccount(account) 277 if err != nil { 278 return err 279 } 280 // Grab stream 281 mset, err := acc.lookupStream(stream) 282 if err != nil { 283 return err 284 } 285 286 if node := mset.raftNode(); node != nil && node.Leader() { 287 node.StepDown() 288 } 289 290 return nil 291 } 292 293 func (s *Server) JetStreamStepdownConsumer(account, stream, consumer string) error { 294 js, cc := s.getJetStreamCluster() 295 if js == nil { 296 return NewJSNotEnabledError() 297 } 298 if cc == nil { 299 return NewJSClusterNotActiveError() 300 } 301 // Grab account 302 acc, err := s.LookupAccount(account) 303 if err != nil { 304 return err 305 } 306 // Grab stream 307 mset, err := acc.lookupStream(stream) 308 if err != nil { 309 return err 310 } 311 312 o := mset.lookupConsumer(consumer) 313 if o == nil { 314 return NewJSConsumerNotFoundError() 315 } 316 317 if node := o.raftNode(); node != nil && node.Leader() { 318 node.StepDown() 319 } 320 321 return nil 322 } 323 324 func (s *Server) JetStreamSnapshotStream(account, stream string) error { 325 js, cc := s.getJetStreamCluster() 326 if js == nil { 327 return NewJSNotEnabledForAccountError() 328 } 329 if cc == nil { 330 return NewJSClusterNotActiveError() 331 } 332 // Grab account 333 acc, err := s.LookupAccount(account) 334 if err != nil { 335 return err 336 } 337 // Grab stream 338 mset, err := acc.lookupStream(stream) 339 if err != nil { 340 return err 341 } 342 343 // Hold lock when installing snapshot. 344 mset.mu.Lock() 345 if mset.node == nil { 346 mset.mu.Unlock() 347 return nil 348 } 349 err = mset.node.InstallSnapshot(mset.stateSnapshotLocked()) 350 mset.mu.Unlock() 351 352 return err 353 } 354 355 func (s *Server) JetStreamClusterPeers() []string { 356 js := s.getJetStream() 357 if js == nil { 358 return nil 359 } 360 js.mu.RLock() 361 defer js.mu.RUnlock() 362 363 cc := js.cluster 364 if !cc.isLeader() || cc.meta == nil { 365 return nil 366 } 367 peers := cc.meta.Peers() 368 var nodes []string 369 for _, p := range peers { 370 si, ok := s.nodeToInfo.Load(p.ID) 371 if !ok || si == nil { 372 continue 373 } 374 ni := si.(nodeInfo) 375 // Ignore if offline, no JS, or no current stats have been received. 
376 if ni.offline || !ni.js || ni.stats == nil { 377 continue 378 } 379 nodes = append(nodes, si.(nodeInfo).name) 380 } 381 return nodes 382 } 383 384 // Read lock should be held. 385 func (cc *jetStreamCluster) isLeader() bool { 386 if cc == nil { 387 // Non-clustered mode 388 return true 389 } 390 return cc.meta != nil && cc.meta.Leader() 391 } 392 393 // isStreamCurrent will determine if the stream is up to date. 394 // For R1 it will make sure the stream is present on this server. 395 // Read lock should be held. 396 func (cc *jetStreamCluster) isStreamCurrent(account, stream string) bool { 397 if cc == nil { 398 // Non-clustered mode 399 return true 400 } 401 as := cc.streams[account] 402 if as == nil { 403 return false 404 } 405 sa := as[stream] 406 if sa == nil { 407 return false 408 } 409 rg := sa.Group 410 if rg == nil { 411 return false 412 } 413 414 if rg.node == nil || rg.node.Current() { 415 // Check if we are processing a snapshot and are catching up. 416 acc, err := cc.s.LookupAccount(account) 417 if err != nil { 418 return false 419 } 420 mset, err := acc.lookupStream(stream) 421 if err != nil { 422 return false 423 } 424 if mset.isCatchingUp() { 425 return false 426 } 427 // Success. 428 return true 429 } 430 431 return false 432 } 433 434 // Restart the stream in question. 435 // Should only be called when the stream is known to be in a bad state. 436 func (js *jetStream) restartStream(acc *Account, csa *streamAssignment) { 437 js.mu.Lock() 438 s, cc := js.srv, js.cluster 439 if cc == nil { 440 js.mu.Unlock() 441 return 442 } 443 // Need to lookup the one directly from the meta layer, what we get handed is a copy if coming from isStreamHealthy. 444 asa := cc.streams[acc.Name] 445 if asa == nil { 446 js.mu.Unlock() 447 return 448 } 449 sa := asa[csa.Config.Name] 450 if sa == nil { 451 js.mu.Unlock() 452 return 453 } 454 // Make sure to clear out the raft node if still present in the meta layer. 455 if rg := sa.Group; rg != nil && rg.node != nil { 456 if rg.node.State() != Closed { 457 rg.node.Stop() 458 } 459 rg.node = nil 460 } 461 sinceCreation := time.Since(sa.Created) 462 js.mu.Unlock() 463 464 // Process stream assignment to recreate. 465 // Check that we have given system enough time to start us up. 466 // This will be longer than obvious, and matches consumer logic in case system very busy. 467 if sinceCreation < 10*time.Second { 468 s.Debugf("Not restarting missing stream '%s > %s', too soon since creation %v", 469 acc, csa.Config.Name, sinceCreation) 470 return 471 } 472 473 js.processStreamAssignment(sa) 474 475 // If we had consumers assigned to this server they will be present in the copy, csa. 476 // They also need to be processed. The csa consumers is a copy of only our consumers, 477 // those assigned to us, but the consumer assignment's there are direct from the meta 478 // layer to make this part much easier and avoid excessive lookups. 479 for _, cca := range csa.consumers { 480 if cca.deleted { 481 continue 482 } 483 // Need to look up original as well here to make sure node is nil. 484 js.mu.Lock() 485 ca := sa.consumers[cca.Name] 486 if ca != nil && ca.Group != nil { 487 // Make sure the node is stopped if still running. 488 if node := ca.Group.node; node != nil && node.State() != Closed { 489 node.Stop() 490 } 491 // Make sure node is wiped. 492 ca.Group.node = nil 493 } 494 js.mu.Unlock() 495 if ca != nil { 496 js.processConsumerAssignment(ca) 497 } 498 } 499 } 500 501 // isStreamHealthy will determine if the stream is up to date or very close. 
502 // For R1 it will make sure the stream is present on this server. 503 func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool { 504 js.mu.RLock() 505 s, cc := js.srv, js.cluster 506 if cc == nil { 507 // Non-clustered mode 508 js.mu.RUnlock() 509 return true 510 } 511 512 // Pull the group out. 513 rg := sa.Group 514 if rg == nil { 515 js.mu.RUnlock() 516 return false 517 } 518 519 streamName := sa.Config.Name 520 node := rg.node 521 js.mu.RUnlock() 522 523 // First lookup stream and make sure its there. 524 mset, err := acc.lookupStream(streamName) 525 if err != nil { 526 js.restartStream(acc, sa) 527 return false 528 } 529 530 // If we are catching up return false. 531 if mset.isCatchingUp() { 532 return false 533 } 534 535 if node == nil || node.Healthy() { 536 // Check if we are processing a snapshot and are catching up. 537 if !mset.isCatchingUp() { 538 return true 539 } 540 } else if node != nil { 541 if node != mset.raftNode() { 542 s.Warnf("Detected stream cluster node skew '%s > %s'", acc.GetName(), streamName) 543 node.Delete() 544 mset.resetClusteredState(nil) 545 } else if node.State() == Closed { 546 js.restartStream(acc, sa) 547 } 548 } 549 550 return false 551 } 552 553 // isConsumerCurrent will determine if the consumer is up to date. 554 // For R1 it will make sure the consunmer is present on this server. 555 func (js *jetStream) isConsumerHealthy(mset *stream, consumer string, ca *consumerAssignment) bool { 556 if mset == nil { 557 return false 558 } 559 560 js.mu.RLock() 561 cc := js.cluster 562 if cc == nil { 563 // Non-clustered mode 564 js.mu.RUnlock() 565 return true 566 } 567 // These are required. 568 if ca == nil || ca.Group == nil { 569 js.mu.RUnlock() 570 return false 571 } 572 s := js.srv 573 js.mu.RUnlock() 574 575 // Capture RAFT node from assignment. 576 node := ca.Group.node 577 578 // When we try to restart we nil out the node if applicable 579 // and reprocess the consumer assignment. 580 restartConsumer := func() { 581 mset.mu.RLock() 582 accName, streamName := mset.acc.GetName(), mset.cfg.Name 583 mset.mu.RUnlock() 584 585 js.mu.Lock() 586 deleted := ca.deleted 587 // Check that we have not just been created. 588 if !deleted && time.Since(ca.Created) < 10*time.Second { 589 s.Debugf("Not restarting missing consumer '%s > %s > %s', too soon since creation %v", 590 accName, streamName, consumer, time.Since(ca.Created)) 591 js.mu.Unlock() 592 return 593 } 594 // Make sure the node is stopped if still running. 595 if node != nil && node.State() != Closed { 596 node.Stop() 597 } 598 ca.Group.node = nil 599 js.mu.Unlock() 600 if !deleted { 601 js.processConsumerAssignment(ca) 602 } 603 } 604 605 // Check if not running at all. 606 o := mset.lookupConsumer(consumer) 607 if o == nil { 608 restartConsumer() 609 return false 610 } 611 612 // Check RAFT node state. 613 if node == nil || node.Healthy() { 614 return true 615 } else if node != nil { 616 if node != o.raftNode() { 617 mset.mu.RLock() 618 accName, streamName := mset.acc.GetName(), mset.cfg.Name 619 mset.mu.RUnlock() 620 s.Warnf("Detected consumer cluster node skew '%s > %s > %s'", accName, streamName, consumer) 621 node.Delete() 622 o.deleteWithoutAdvisory() 623 restartConsumer() 624 } else if node.State() == Closed { 625 // We have a consumer, and it should have a running node but it is closed. 
626 o.stop() 627 restartConsumer() 628 } 629 } 630 return false 631 } 632 633 // subjectsOverlap checks all existing stream assignments for the account cross-cluster for subject overlap 634 // Use only for clustered JetStream 635 // Read lock should be held. 636 func (jsc *jetStreamCluster) subjectsOverlap(acc string, subjects []string, osa *streamAssignment) bool { 637 asa := jsc.streams[acc] 638 for _, sa := range asa { 639 // can't overlap yourself, assume osa pre-checked for deep equal if passed 640 if osa != nil && sa == osa { 641 continue 642 } 643 for _, subj := range sa.Config.Subjects { 644 for _, tsubj := range subjects { 645 if SubjectsCollide(tsubj, subj) { 646 return true 647 } 648 } 649 } 650 } 651 return false 652 } 653 654 func (a *Account) getJetStreamFromAccount() (*Server, *jetStream, *jsAccount) { 655 a.mu.RLock() 656 jsa := a.js 657 a.mu.RUnlock() 658 if jsa == nil { 659 return nil, nil, nil 660 } 661 jsa.mu.RLock() 662 js := jsa.js 663 jsa.mu.RUnlock() 664 if js == nil { 665 return nil, nil, nil 666 } 667 // Lock not needed, set on creation. 668 s := js.srv 669 return s, js, jsa 670 } 671 672 func (s *Server) JetStreamIsStreamLeader(account, stream string) bool { 673 js, cc := s.getJetStreamCluster() 674 if js == nil || cc == nil { 675 return false 676 } 677 js.mu.RLock() 678 defer js.mu.RUnlock() 679 return cc.isStreamLeader(account, stream) 680 } 681 682 func (a *Account) JetStreamIsStreamLeader(stream string) bool { 683 s, js, jsa := a.getJetStreamFromAccount() 684 if s == nil || js == nil || jsa == nil { 685 return false 686 } 687 js.mu.RLock() 688 defer js.mu.RUnlock() 689 return js.cluster.isStreamLeader(a.Name, stream) 690 } 691 692 func (s *Server) JetStreamIsStreamCurrent(account, stream string) bool { 693 js, cc := s.getJetStreamCluster() 694 if js == nil { 695 return false 696 } 697 js.mu.RLock() 698 defer js.mu.RUnlock() 699 return cc.isStreamCurrent(account, stream) 700 } 701 702 func (a *Account) JetStreamIsConsumerLeader(stream, consumer string) bool { 703 s, js, jsa := a.getJetStreamFromAccount() 704 if s == nil || js == nil || jsa == nil { 705 return false 706 } 707 js.mu.RLock() 708 defer js.mu.RUnlock() 709 return js.cluster.isConsumerLeader(a.Name, stream, consumer) 710 } 711 712 func (s *Server) JetStreamIsConsumerLeader(account, stream, consumer string) bool { 713 js, cc := s.getJetStreamCluster() 714 if js == nil || cc == nil { 715 return false 716 } 717 js.mu.RLock() 718 defer js.mu.RUnlock() 719 return cc.isConsumerLeader(account, stream, consumer) 720 } 721 722 func (s *Server) enableJetStreamClustering() error { 723 if !s.isRunning() { 724 return nil 725 } 726 js := s.getJetStream() 727 if js == nil { 728 return NewJSNotEnabledForAccountError() 729 } 730 // Already set. 731 if js.cluster != nil { 732 return nil 733 } 734 735 s.Noticef("Starting JetStream cluster") 736 // We need to determine if we have a stable cluster name and expected number of servers. 737 s.Debugf("JetStream cluster checking for stable cluster name and peers") 738 739 hasLeafNodeSystemShare := s.canExtendOtherDomain() 740 if s.isClusterNameDynamic() && !hasLeafNodeSystemShare { 741 return errors.New("JetStream cluster requires cluster name") 742 } 743 return js.setupMetaGroup() 744 } 745 746 // isClustered returns if we are clustered. 747 // Lock should not be held. 748 func (js *jetStream) isClustered() bool { 749 // This is only ever set, no need for lock here. 
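// Example for subjectsOverlap above (sketch): SubjectsCollide reports whether two
// subjects, either of which may contain wildcards, can match the same message.
// For instance "orders.*" collides with "orders.new", and "orders.>" collides
// with "orders.eu.new", while "orders.new" and "orders.eu" do not. A single
// collision against any existing assignment in the account is enough for the
// check to return true.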
750 return js.cluster != nil 751 } 752 753 // isClusteredNoLock returns if we are clustered, but unlike isClustered() does 754 // not use the jetstream's lock, instead, uses an atomic operation. 755 // There are situations where some code wants to know if we are clustered but 756 // can't use js.isClustered() without causing a lock inversion. 757 func (js *jetStream) isClusteredNoLock() bool { 758 return atomic.LoadInt32(&js.clustered) == 1 759 } 760 761 func (js *jetStream) setupMetaGroup() error { 762 s := js.srv 763 s.Noticef("Creating JetStream metadata controller") 764 765 // Setup our WAL for the metagroup. 766 sysAcc := s.SystemAccount() 767 storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, defaultMetaGroupName) 768 769 fs, err := newFileStoreWithCreated( 770 FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMetaFSBlkSize, AsyncFlush: false, srv: s}, 771 StreamConfig{Name: defaultMetaGroupName, Storage: FileStorage}, 772 time.Now().UTC(), 773 s.jsKeyGen(s.getOpts().JetStreamKey, defaultMetaGroupName), 774 s.jsKeyGen(s.getOpts().JetStreamOldKey, defaultMetaGroupName), 775 ) 776 if err != nil { 777 s.Errorf("Error creating filestore: %v", err) 778 return err 779 } 780 781 cfg := &RaftConfig{Name: defaultMetaGroupName, Store: storeDir, Log: fs} 782 783 // If we are soliciting leafnode connections and we are sharing a system account and do not disable it with a hint, 784 // we want to move to observer mode so that we extend the solicited cluster or supercluster but do not form our own. 785 cfg.Observer = s.canExtendOtherDomain() && s.getOpts().JetStreamExtHint != jsNoExtend 786 787 var bootstrap bool 788 if ps, err := readPeerState(storeDir); err != nil { 789 s.Noticef("JetStream cluster bootstrapping") 790 bootstrap = true 791 peers := s.ActivePeers() 792 s.Debugf("JetStream cluster initial peers: %+v", peers) 793 if err := s.bootstrapRaftNode(cfg, peers, false); err != nil { 794 return err 795 } 796 if cfg.Observer { 797 s.Noticef("Turning JetStream metadata controller Observer Mode on") 798 } 799 } else { 800 s.Noticef("JetStream cluster recovering state") 801 // correlate the value of observer with observations from a previous run. 802 if cfg.Observer { 803 switch ps.domainExt { 804 case extExtended: 805 s.Noticef("Keeping JetStream metadata controller Observer Mode on - due to previous contact") 806 case extNotExtended: 807 s.Noticef("Turning JetStream metadata controller Observer Mode off - due to previous contact") 808 cfg.Observer = false 809 case extUndetermined: 810 s.Noticef("Turning JetStream metadata controller Observer Mode on - no previous contact") 811 s.Noticef("In cases where JetStream will not be extended") 812 s.Noticef("and waiting for leader election until first contact is not acceptable,") 813 s.Noticef(`manually disable Observer Mode by setting the JetStream Option "extension_hint: %s"`, jsNoExtend) 814 } 815 } else { 816 // To track possible configuration changes, responsible for an altered value of cfg.Observer, 817 // set extension state to undetermined. 818 ps.domainExt = extUndetermined 819 if err := writePeerState(storeDir, ps); err != nil { 820 return err 821 } 822 } 823 } 824 825 // Start up our meta node. 826 n, err := s.startRaftNode(sysAcc.GetName(), cfg, pprofLabels{ 827 "type": "metaleader", 828 "account": sysAcc.Name, 829 }) 830 if err != nil { 831 s.Warnf("Could not start metadata controller: %v", err) 832 return err 833 } 834 835 // If we are bootstrapped with no state, start campaign early. 
836 if bootstrap { 837 n.Campaign() 838 } 839 840 c := s.createInternalJetStreamClient() 841 sacc := s.SystemAccount() 842 843 js.mu.Lock() 844 defer js.mu.Unlock() 845 js.cluster = &jetStreamCluster{ 846 meta: n, 847 streams: make(map[string]map[string]*streamAssignment), 848 s: s, 849 c: c, 850 qch: make(chan struct{}), 851 } 852 atomic.StoreInt32(&js.clustered, 1) 853 c.registerWithAccount(sacc) 854 855 js.srv.startGoRoutine( 856 js.monitorCluster, 857 pprofLabels{ 858 "type": "metaleader", 859 "account": sacc.Name, 860 }, 861 ) 862 return nil 863 } 864 865 func (js *jetStream) getMetaGroup() RaftNode { 866 js.mu.RLock() 867 defer js.mu.RUnlock() 868 if js.cluster == nil { 869 return nil 870 } 871 return js.cluster.meta 872 } 873 874 func (js *jetStream) server() *Server { 875 // Lock not needed, only set once on creation. 876 return js.srv 877 } 878 879 // Will respond if we do not think we have a metacontroller leader. 880 func (js *jetStream) isLeaderless() bool { 881 js.mu.RLock() 882 defer js.mu.RUnlock() 883 884 cc := js.cluster 885 if cc == nil || cc.meta == nil { 886 return false 887 } 888 // If we don't have a leader. 889 // Make sure we have been running for enough time. 890 if cc.meta.GroupLeader() == _EMPTY_ && time.Since(cc.meta.Created()) > lostQuorumIntervalDefault { 891 return true 892 } 893 return false 894 } 895 896 // Will respond iff we are a member and we know we have no leader. 897 func (js *jetStream) isGroupLeaderless(rg *raftGroup) bool { 898 if rg == nil || js == nil { 899 return false 900 } 901 js.mu.RLock() 902 defer js.mu.RUnlock() 903 904 cc := js.cluster 905 906 // If we are not a member we can not say.. 907 if cc.meta == nil { 908 return false 909 } 910 if !rg.isMember(cc.meta.ID()) { 911 return false 912 } 913 // Single peer groups always have a leader if we are here. 914 if rg.node == nil { 915 return false 916 } 917 // If we don't have a leader. 918 if rg.node.GroupLeader() == _EMPTY_ { 919 // Threshold for jetstream startup. 920 const startupThreshold = 10 * time.Second 921 922 if rg.node.HadPreviousLeader() { 923 // Make sure we have been running long enough to intelligently determine this. 924 if time.Since(js.started) > startupThreshold { 925 return true 926 } 927 } 928 // Make sure we have been running for enough time. 929 if time.Since(rg.node.Created()) > lostQuorumIntervalDefault { 930 return true 931 } 932 } 933 934 return false 935 } 936 937 func (s *Server) JetStreamIsStreamAssigned(account, stream string) bool { 938 js, cc := s.getJetStreamCluster() 939 if js == nil || cc == nil { 940 return false 941 } 942 acc, _ := s.LookupAccount(account) 943 if acc == nil { 944 return false 945 } 946 js.mu.RLock() 947 assigned := cc.isStreamAssigned(acc, stream) 948 js.mu.RUnlock() 949 return assigned 950 } 951 952 // streamAssigned informs us if this server has this stream assigned. 953 func (jsa *jsAccount) streamAssigned(stream string) bool { 954 jsa.mu.RLock() 955 js, acc := jsa.js, jsa.account 956 jsa.mu.RUnlock() 957 958 if js == nil { 959 return false 960 } 961 js.mu.RLock() 962 assigned := js.cluster.isStreamAssigned(acc, stream) 963 js.mu.RUnlock() 964 return assigned 965 } 966 967 // Read lock should be held. 968 func (cc *jetStreamCluster) isStreamAssigned(a *Account, stream string) bool { 969 // Non-clustered mode always return true. 
970 if cc == nil { 971 return true 972 } 973 if cc.meta == nil { 974 return false 975 } 976 as := cc.streams[a.Name] 977 if as == nil { 978 return false 979 } 980 sa := as[stream] 981 if sa == nil { 982 return false 983 } 984 rg := sa.Group 985 if rg == nil { 986 return false 987 } 988 // Check if we are the leader of this raftGroup assigned to the stream. 989 ourID := cc.meta.ID() 990 for _, peer := range rg.Peers { 991 if peer == ourID { 992 return true 993 } 994 } 995 return false 996 } 997 998 // Read lock should be held. 999 func (cc *jetStreamCluster) isStreamLeader(account, stream string) bool { 1000 // Non-clustered mode always return true. 1001 if cc == nil { 1002 return true 1003 } 1004 if cc.meta == nil { 1005 return false 1006 } 1007 1008 var sa *streamAssignment 1009 if as := cc.streams[account]; as != nil { 1010 sa = as[stream] 1011 } 1012 if sa == nil { 1013 return false 1014 } 1015 rg := sa.Group 1016 if rg == nil { 1017 return false 1018 } 1019 // Check if we are the leader of this raftGroup assigned to the stream. 1020 ourID := cc.meta.ID() 1021 for _, peer := range rg.Peers { 1022 if peer == ourID { 1023 if len(rg.Peers) == 1 || rg.node != nil && rg.node.Leader() { 1024 return true 1025 } 1026 } 1027 } 1028 return false 1029 } 1030 1031 // Read lock should be held. 1032 func (cc *jetStreamCluster) isConsumerLeader(account, stream, consumer string) bool { 1033 // Non-clustered mode always return true. 1034 if cc == nil { 1035 return true 1036 } 1037 if cc.meta == nil { 1038 return false 1039 } 1040 1041 var sa *streamAssignment 1042 if as := cc.streams[account]; as != nil { 1043 sa = as[stream] 1044 } 1045 if sa == nil { 1046 return false 1047 } 1048 // Check if we are the leader of this raftGroup assigned to this consumer. 1049 ca := sa.consumers[consumer] 1050 if ca == nil { 1051 return false 1052 } 1053 rg := ca.Group 1054 ourID := cc.meta.ID() 1055 for _, peer := range rg.Peers { 1056 if peer == ourID { 1057 if len(rg.Peers) == 1 || (rg.node != nil && rg.node.Leader()) { 1058 return true 1059 } 1060 } 1061 } 1062 return false 1063 } 1064 1065 // Remove the stream `streamName` for the account `accName` from the inflight 1066 // proposals map. This is done on success (processStreamAssignment) or on 1067 // failure (processStreamAssignmentResults). 1068 // (Write) Lock held on entry. 1069 func (cc *jetStreamCluster) removeInflightProposal(accName, streamName string) { 1070 streams, ok := cc.inflight[accName] 1071 if !ok { 1072 return 1073 } 1074 delete(streams, streamName) 1075 if len(streams) == 0 { 1076 delete(cc.inflight, accName) 1077 } 1078 } 1079 1080 // Return the cluster quit chan. 1081 func (js *jetStream) clusterQuitC() chan struct{} { 1082 js.mu.RLock() 1083 defer js.mu.RUnlock() 1084 if js.cluster != nil { 1085 return js.cluster.qch 1086 } 1087 return nil 1088 } 1089 1090 // Mark that the meta layer is recovering. 1091 func (js *jetStream) setMetaRecovering() { 1092 js.mu.Lock() 1093 defer js.mu.Unlock() 1094 if js.cluster != nil { 1095 // metaRecovering 1096 js.metaRecovering = true 1097 } 1098 } 1099 1100 // Mark that the meta layer is no longer recovering. 1101 func (js *jetStream) clearMetaRecovering() { 1102 js.mu.Lock() 1103 defer js.mu.Unlock() 1104 js.metaRecovering = false 1105 } 1106 1107 // Return whether the meta layer is recovering. 
1108 func (js *jetStream) isMetaRecovering() bool { 1109 js.mu.RLock() 1110 defer js.mu.RUnlock() 1111 return js.metaRecovering 1112 } 1113 1114 // During recovery track any stream and consumer delete and update operations. 1115 type recoveryUpdates struct { 1116 removeStreams map[string]*streamAssignment 1117 removeConsumers map[string]*consumerAssignment 1118 updateStreams map[string]*streamAssignment 1119 updateConsumers map[string]*consumerAssignment 1120 } 1121 1122 // Called after recovery of the cluster on startup to check for any orphans. 1123 // Streams and consumers are recovered from disk, and the meta layer's mappings 1124 // should clean them up, but under crash scenarios there could be orphans. 1125 func (js *jetStream) checkForOrphans() { 1126 consumerName := func(o *consumer) string { 1127 o.mu.RLock() 1128 defer o.mu.RUnlock() 1129 return o.name 1130 } 1131 1132 // Can not hold jetstream lock while trying to delete streams or consumers. 1133 js.mu.Lock() 1134 s, cc := js.srv, js.cluster 1135 s.Debugf("JetStream cluster checking for orphans") 1136 1137 var streams []*stream 1138 var consumers []*consumer 1139 1140 for accName, jsa := range js.accounts { 1141 asa := cc.streams[accName] 1142 jsa.mu.RLock() 1143 for stream, mset := range jsa.streams { 1144 if sa := asa[stream]; sa == nil { 1145 streams = append(streams, mset) 1146 } else { 1147 // This one is good, check consumers now. 1148 for _, o := range mset.getConsumers() { 1149 consumer := consumerName(o) 1150 if sa.consumers[consumer] == nil { 1151 consumers = append(consumers, o) 1152 } 1153 } 1154 } 1155 } 1156 jsa.mu.RUnlock() 1157 } 1158 js.mu.Unlock() 1159 1160 for _, mset := range streams { 1161 mset.mu.RLock() 1162 accName, stream := mset.acc.Name, mset.cfg.Name 1163 mset.mu.RUnlock() 1164 s.Warnf("Detected orphaned stream '%s > %s', will cleanup", accName, stream) 1165 if err := mset.delete(); err != nil { 1166 s.Warnf("Deleting stream encountered an error: %v", err) 1167 } 1168 } 1169 for _, o := range consumers { 1170 o.mu.RLock() 1171 accName, mset, consumer := o.acc.Name, o.mset, o.name 1172 o.mu.RUnlock() 1173 stream := "N/A" 1174 if mset != nil { 1175 mset.mu.RLock() 1176 stream = mset.cfg.Name 1177 mset.mu.RUnlock() 1178 } 1179 s.Warnf("Detected orphaned consumer '%s > %s > %s', will cleanup", accName, stream, consumer) 1180 if err := o.delete(); err != nil { 1181 s.Warnf("Deleting consumer encountered an error: %v", err) 1182 } 1183 } 1184 } 1185 1186 // Check and delete any orphans we may come across. 1187 func (s *Server) checkForNRGOrphans() { 1188 js, cc := s.getJetStreamCluster() 1189 if js == nil || cc == nil || js.isMetaRecovering() { 1190 // No cluster means no NRGs. Also return if still recovering. 1191 return 1192 } 1193 1194 // Track which assets R>1 should be on this server. 1195 nrgMap := make(map[string]struct{}) 1196 trackGroup := func(rg *raftGroup) { 1197 // If R>1 track this as a legit NRG. 1198 if rg.node != nil { 1199 nrgMap[rg.Name] = struct{}{} 1200 } 1201 } 1202 // Register our meta. 1203 js.mu.RLock() 1204 meta := cc.meta 1205 if meta == nil { 1206 js.mu.RUnlock() 1207 // Bail with no meta node. 1208 return 1209 } 1210 1211 ourID := meta.ID() 1212 nrgMap[meta.Group()] = struct{}{} 1213 1214 // Collect all valid groups from our assignments. 
1215 for _, asa := range cc.streams { 1216 for _, sa := range asa { 1217 if sa.Group.isMember(ourID) && sa.Restore == nil { 1218 trackGroup(sa.Group) 1219 for _, ca := range sa.consumers { 1220 if ca.Group.isMember(ourID) { 1221 trackGroup(ca.Group) 1222 } 1223 } 1224 } 1225 } 1226 } 1227 js.mu.RUnlock() 1228 1229 // Check NRGs that are running. 1230 var needDelete []RaftNode 1231 s.rnMu.RLock() 1232 for name, n := range s.raftNodes { 1233 if _, ok := nrgMap[name]; !ok { 1234 needDelete = append(needDelete, n) 1235 } 1236 } 1237 s.rnMu.RUnlock() 1238 1239 for _, n := range needDelete { 1240 s.Warnf("Detected orphaned NRG %q, will cleanup", n.Group()) 1241 n.Delete() 1242 } 1243 } 1244 1245 func (js *jetStream) monitorCluster() { 1246 s, n := js.server(), js.getMetaGroup() 1247 qch, rqch, lch, aq := js.clusterQuitC(), n.QuitC(), n.LeadChangeC(), n.ApplyQ() 1248 1249 defer s.grWG.Done() 1250 1251 s.Debugf("Starting metadata monitor") 1252 defer s.Debugf("Exiting metadata monitor") 1253 1254 // Make sure to stop the raft group on exit to prevent accidental memory bloat. 1255 defer n.Stop() 1256 defer s.isMetaLeader.Store(false) 1257 1258 const compactInterval = time.Minute 1259 t := time.NewTicker(compactInterval) 1260 defer t.Stop() 1261 1262 // Used to check cold boot cluster when possibly in mixed mode. 1263 const leaderCheckInterval = time.Second 1264 lt := time.NewTicker(leaderCheckInterval) 1265 defer lt.Stop() 1266 1267 // Check the general health once an hour. 1268 const healthCheckInterval = 1 * time.Hour 1269 ht := time.NewTicker(healthCheckInterval) 1270 defer ht.Stop() 1271 1272 // Utility to check health. 1273 checkHealth := func() { 1274 if hs := s.healthz(nil); hs.Error != _EMPTY_ { 1275 s.Warnf("%v", hs.Error) 1276 } 1277 // Also check for orphaned NRGs. 1278 s.checkForNRGOrphans() 1279 } 1280 1281 var ( 1282 isLeader bool 1283 lastSnapTime time.Time 1284 compactSizeMin = uint64(8 * 1024 * 1024) // 8MB 1285 minSnapDelta = 10 * time.Second 1286 ) 1287 1288 // Highwayhash key for generating hashes. 1289 key := make([]byte, 32) 1290 crand.Read(key) 1291 1292 // Set to true to start. 1293 js.setMetaRecovering() 1294 1295 // Snapshotting function. 1296 doSnapshot := func() { 1297 // Suppress during recovery. 1298 if js.isMetaRecovering() { 1299 return 1300 } 1301 // For the meta layer we want to snapshot when asked if we need one or have any entries that we can compact. 1302 if ne, _ := n.Size(); ne > 0 || n.NeedSnapshot() { 1303 if err := n.InstallSnapshot(js.metaSnapshot()); err == nil { 1304 lastSnapTime = time.Now() 1305 } else if err != errNoSnapAvailable && err != errNodeClosed { 1306 s.Warnf("Error snapshotting JetStream cluster state: %v", err) 1307 } 1308 } 1309 } 1310 1311 ru := &recoveryUpdates{ 1312 removeStreams: make(map[string]*streamAssignment), 1313 removeConsumers: make(map[string]*consumerAssignment), 1314 updateStreams: make(map[string]*streamAssignment), 1315 updateConsumers: make(map[string]*consumerAssignment), 1316 } 1317 1318 for { 1319 select { 1320 case <-s.quitCh: 1321 return 1322 case <-rqch: 1323 return 1324 case <-qch: 1325 // Clean signal from shutdown routine so do best effort attempt to snapshot meta layer. 1326 doSnapshot() 1327 // Return the signal back since shutdown will be waiting. 1328 close(qch) 1329 return 1330 case <-aq.ch: 1331 ces := aq.pop() 1332 for _, ce := range ces { 1333 if ce == nil { 1334 // Signals we have replayed all of our metadata. 1335 js.clearMetaRecovering() 1336 // Process any removes that are still valid after recovery. 
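// How the recoveryUpdates maps above are maintained (sketch): entries are keyed by
// recoveryKey(), i.e. "account:stream" for streams and "account:stream:consumer"
// for consumers, and a later assignment or update for a key cancels a pending
// removal of the same asset (and vice versa), so only the net effect is replayed
// here once recovery completes:
//
//	key := sa.recoveryKey()
//	ru.updateStreams[key] = sa    // update seen during recovery
//	delete(ru.removeStreams, key) // cancels an earlier removal of the same stream
//
// See applyMetaSnapshot and applyMetaEntries below for the real bookkeeping.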
1337 for _, ca := range ru.removeConsumers { 1338 js.processConsumerRemoval(ca) 1339 } 1340 for _, sa := range ru.removeStreams { 1341 js.processStreamRemoval(sa) 1342 } 1343 // Process pending updates. 1344 for _, sa := range ru.updateStreams { 1345 js.processUpdateStreamAssignment(sa) 1346 } 1347 // Now consumers. 1348 for _, ca := range ru.updateConsumers { 1349 js.processConsumerAssignment(ca) 1350 } 1351 // Clear. 1352 ru = nil 1353 s.Debugf("Recovered JetStream cluster metadata") 1354 js.checkForOrphans() 1355 // Do a health check here as well. 1356 go checkHealth() 1357 continue 1358 } 1359 if didSnap, didStreamRemoval, didConsumerRemoval, err := js.applyMetaEntries(ce.Entries, ru); err == nil { 1360 _, nb := n.Applied(ce.Index) 1361 if js.hasPeerEntries(ce.Entries) || didStreamRemoval || (didSnap && !isLeader) { 1362 doSnapshot() 1363 } else if didConsumerRemoval && time.Since(lastSnapTime) > minSnapDelta/2 { 1364 doSnapshot() 1365 } else if nb > compactSizeMin && time.Since(lastSnapTime) > minSnapDelta { 1366 doSnapshot() 1367 } 1368 ce.ReturnToPool() 1369 } else { 1370 s.Warnf("Error applying JetStream cluster entries: %v", err) 1371 } 1372 } 1373 aq.recycle(&ces) 1374 1375 case isLeader = <-lch: 1376 // For meta layer synchronize everyone to our state on becoming leader. 1377 if isLeader && n.ApplyQ().len() == 0 { 1378 n.SendSnapshot(js.metaSnapshot()) 1379 } 1380 // Process the change. 1381 js.processLeaderChange(isLeader) 1382 if isLeader { 1383 s.sendInternalMsgLocked(serverStatsPingReqSubj, _EMPTY_, nil, nil) 1384 // Install a snapshot as we become leader. 1385 js.checkClusterSize() 1386 doSnapshot() 1387 } 1388 1389 case <-t.C: 1390 doSnapshot() 1391 // Periodically check the cluster size. 1392 if n.Leader() { 1393 js.checkClusterSize() 1394 } 1395 case <-ht.C: 1396 // Do this in a separate go routine. 1397 go checkHealth() 1398 1399 case <-lt.C: 1400 s.Debugf("Checking JetStream cluster state") 1401 // If we have a current leader or had one in the past we can cancel this here since the metaleader 1402 // will be in charge of all peer state changes. 1403 // For cold boot only. 1404 if n.GroupLeader() != _EMPTY_ || n.HadPreviousLeader() { 1405 lt.Stop() 1406 continue 1407 } 1408 // If we are here we do not have a leader and we did not have a previous one, so cold start. 1409 // Check to see if we can adjust our cluster size down iff we are in mixed mode and we have 1410 // seen a total that is what our original estimate was. 1411 cs := n.ClusterSize() 1412 if js, total := s.trackedJetStreamServers(); js < total && total >= cs && js != cs { 1413 s.Noticef("Adjusting JetStream expected peer set size to %d from original %d", js, cs) 1414 n.AdjustBootClusterSize(js) 1415 } 1416 } 1417 } 1418 } 1419 1420 // This is called on first leader transition to double check the peers and cluster set size. 1421 func (js *jetStream) checkClusterSize() { 1422 s, n := js.server(), js.getMetaGroup() 1423 if n == nil { 1424 return 1425 } 1426 // We will check that we have a correct cluster set size by checking for any non-js servers 1427 // which can happen in mixed mode. 1428 ps := n.(*raft).currentPeerState() 1429 if len(ps.knownPeers) >= ps.clusterSize { 1430 return 1431 } 1432 1433 // Grab our active peers. 1434 peers := s.ActivePeers() 1435 1436 // If we have not registered all of our peers yet we can't do 1437 // any adjustments based on a mixed mode. We will periodically check back. 
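// Worked example of the mixed-mode adjustment (sketch): with an expected boot
// cluster size of cs=5 where only 3 of the 5 tracked servers run JetStream, the
// cold-boot check in monitorCluster above sees js=3 and total=5, all three
// conditions hold, and AdjustBootClusterSize(3) shrinks the expected peer set so
// the meta group does not wait on peers that will never participate. Once a
// leader is elected, the same correction is applied below via AdjustClusterSize.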
1438 if len(peers) < ps.clusterSize { 1439 return 1440 } 1441 1442 s.Debugf("Checking JetStream cluster size") 1443 1444 // If we are here our known set as the leader is not the same as the cluster size. 1445 // Check to see if we have a mixed mode setup. 1446 var totalJS int 1447 for _, p := range peers { 1448 if si, ok := s.nodeToInfo.Load(p); ok && si != nil { 1449 if si.(nodeInfo).js { 1450 totalJS++ 1451 } 1452 } 1453 } 1454 // If we have less then our cluster size adjust that here. Can not do individual peer removals since 1455 // they will not be in the tracked peers. 1456 if totalJS < ps.clusterSize { 1457 s.Debugf("Adjusting JetStream cluster size from %d to %d", ps.clusterSize, totalJS) 1458 if err := n.AdjustClusterSize(totalJS); err != nil { 1459 s.Warnf("Error adjusting JetStream cluster size: %v", err) 1460 } 1461 } 1462 } 1463 1464 // Represents our stable meta state that we can write out. 1465 type writeableStreamAssignment struct { 1466 Client *ClientInfo `json:"client,omitempty"` 1467 Created time.Time `json:"created"` 1468 Config *StreamConfig `json:"stream"` 1469 Group *raftGroup `json:"group"` 1470 Sync string `json:"sync"` 1471 Consumers []*consumerAssignment 1472 } 1473 1474 func (js *jetStream) clusterStreamConfig(accName, streamName string) (StreamConfig, bool) { 1475 js.mu.RLock() 1476 defer js.mu.RUnlock() 1477 if sa, ok := js.cluster.streams[accName][streamName]; ok { 1478 return *sa.Config, true 1479 } 1480 return StreamConfig{}, false 1481 } 1482 1483 func (js *jetStream) metaSnapshot() []byte { 1484 js.mu.RLock() 1485 cc := js.cluster 1486 nsa := 0 1487 for _, asa := range cc.streams { 1488 nsa += len(asa) 1489 } 1490 streams := make([]writeableStreamAssignment, 0, nsa) 1491 for _, asa := range cc.streams { 1492 for _, sa := range asa { 1493 wsa := writeableStreamAssignment{ 1494 Client: sa.Client, 1495 Created: sa.Created, 1496 Config: sa.Config, 1497 Group: sa.Group, 1498 Sync: sa.Sync, 1499 Consumers: make([]*consumerAssignment, 0, len(sa.consumers)), 1500 } 1501 for _, ca := range sa.consumers { 1502 wsa.Consumers = append(wsa.Consumers, ca) 1503 } 1504 streams = append(streams, wsa) 1505 } 1506 } 1507 1508 if len(streams) == 0 { 1509 js.mu.RUnlock() 1510 return nil 1511 } 1512 1513 b, _ := json.Marshal(streams) 1514 js.mu.RUnlock() 1515 1516 return s2.EncodeBetter(nil, b) 1517 } 1518 1519 func (js *jetStream) applyMetaSnapshot(buf []byte, ru *recoveryUpdates, isRecovering bool) error { 1520 var wsas []writeableStreamAssignment 1521 if len(buf) > 0 { 1522 jse, err := s2.Decode(nil, buf) 1523 if err != nil { 1524 return err 1525 } 1526 if err = json.Unmarshal(jse, &wsas); err != nil { 1527 return err 1528 } 1529 } 1530 1531 // Build our new version here outside of js. 
1532 streams := make(map[string]map[string]*streamAssignment) 1533 for _, wsa := range wsas { 1534 fixCfgMirrorWithDedupWindow(wsa.Config) 1535 as := streams[wsa.Client.serviceAccount()] 1536 if as == nil { 1537 as = make(map[string]*streamAssignment) 1538 streams[wsa.Client.serviceAccount()] = as 1539 } 1540 sa := &streamAssignment{Client: wsa.Client, Created: wsa.Created, Config: wsa.Config, Group: wsa.Group, Sync: wsa.Sync} 1541 if len(wsa.Consumers) > 0 { 1542 sa.consumers = make(map[string]*consumerAssignment) 1543 for _, ca := range wsa.Consumers { 1544 sa.consumers[ca.Name] = ca 1545 } 1546 } 1547 as[wsa.Config.Name] = sa 1548 } 1549 1550 js.mu.Lock() 1551 cc := js.cluster 1552 1553 var saAdd, saDel, saChk []*streamAssignment 1554 // Walk through the old list to generate the delete list. 1555 for account, asa := range cc.streams { 1556 nasa := streams[account] 1557 for sn, sa := range asa { 1558 if nsa := nasa[sn]; nsa == nil { 1559 saDel = append(saDel, sa) 1560 } else { 1561 saChk = append(saChk, nsa) 1562 } 1563 } 1564 } 1565 // Walk through the new list to generate the add list. 1566 for account, nasa := range streams { 1567 asa := cc.streams[account] 1568 for sn, sa := range nasa { 1569 if asa[sn] == nil { 1570 saAdd = append(saAdd, sa) 1571 } 1572 } 1573 } 1574 // Now walk the ones to check and process consumers. 1575 var caAdd, caDel []*consumerAssignment 1576 for _, sa := range saChk { 1577 // Make sure to add in all the new ones from sa. 1578 for _, ca := range sa.consumers { 1579 caAdd = append(caAdd, ca) 1580 } 1581 if osa := js.streamAssignment(sa.Client.serviceAccount(), sa.Config.Name); osa != nil { 1582 for _, ca := range osa.consumers { 1583 if sa.consumers[ca.Name] == nil { 1584 caDel = append(caDel, ca) 1585 } else { 1586 caAdd = append(caAdd, ca) 1587 } 1588 } 1589 } 1590 } 1591 js.mu.Unlock() 1592 1593 // Do removals first. 1594 for _, sa := range saDel { 1595 js.setStreamAssignmentRecovering(sa) 1596 if isRecovering { 1597 key := sa.recoveryKey() 1598 ru.removeStreams[key] = sa 1599 delete(ru.updateStreams, key) 1600 } else { 1601 js.processStreamRemoval(sa) 1602 } 1603 } 1604 // Now do add for the streams. Also add in all consumers. 1605 for _, sa := range saAdd { 1606 js.setStreamAssignmentRecovering(sa) 1607 js.processStreamAssignment(sa) 1608 1609 // We can simply process the consumers. 1610 for _, ca := range sa.consumers { 1611 js.setConsumerAssignmentRecovering(ca) 1612 js.processConsumerAssignment(ca) 1613 } 1614 } 1615 1616 // Perform updates on those in saChk. These were existing so make 1617 // sure to process any changes. 1618 for _, sa := range saChk { 1619 js.setStreamAssignmentRecovering(sa) 1620 if isRecovering { 1621 key := sa.recoveryKey() 1622 ru.updateStreams[key] = sa 1623 delete(ru.removeStreams, key) 1624 } else { 1625 js.processUpdateStreamAssignment(sa) 1626 } 1627 } 1628 1629 // Now do the deltas for existing stream's consumers. 
1630 for _, ca := range caDel { 1631 js.setConsumerAssignmentRecovering(ca) 1632 if isRecovering { 1633 key := ca.recoveryKey() 1634 ru.removeConsumers[key] = ca 1635 delete(ru.updateConsumers, key) 1636 } else { 1637 js.processConsumerRemoval(ca) 1638 } 1639 } 1640 for _, ca := range caAdd { 1641 js.setConsumerAssignmentRecovering(ca) 1642 if isRecovering { 1643 key := ca.recoveryKey() 1644 delete(ru.removeConsumers, key) 1645 ru.updateConsumers[key] = ca 1646 } else { 1647 js.processConsumerAssignment(ca) 1648 } 1649 } 1650 1651 return nil 1652 } 1653 1654 // Called on recovery to make sure we do not process like original. 1655 func (js *jetStream) setStreamAssignmentRecovering(sa *streamAssignment) { 1656 js.mu.Lock() 1657 defer js.mu.Unlock() 1658 sa.responded = true 1659 sa.recovering = true 1660 sa.Restore = nil 1661 if sa.Group != nil { 1662 sa.Group.Preferred = _EMPTY_ 1663 } 1664 } 1665 1666 // Called on recovery to make sure we do not process like original. 1667 func (js *jetStream) setConsumerAssignmentRecovering(ca *consumerAssignment) { 1668 js.mu.Lock() 1669 defer js.mu.Unlock() 1670 ca.responded = true 1671 ca.recovering = true 1672 if ca.Group != nil { 1673 ca.Group.Preferred = _EMPTY_ 1674 } 1675 } 1676 1677 // Just copies over and changes out the group so it can be encoded. 1678 // Lock should be held. 1679 func (sa *streamAssignment) copyGroup() *streamAssignment { 1680 csa, cg := *sa, *sa.Group 1681 csa.Group = &cg 1682 csa.Group.Peers = copyStrings(sa.Group.Peers) 1683 return &csa 1684 } 1685 1686 // Just copies over and changes out the group so it can be encoded. 1687 // Lock should be held. 1688 func (ca *consumerAssignment) copyGroup() *consumerAssignment { 1689 cca, cg := *ca, *ca.Group 1690 cca.Group = &cg 1691 cca.Group.Peers = copyStrings(ca.Group.Peers) 1692 return &cca 1693 } 1694 1695 // Lock should be held. 1696 func (sa *streamAssignment) missingPeers() bool { 1697 return len(sa.Group.Peers) < sa.Config.Replicas 1698 } 1699 1700 // Called when we detect a new peer. Only the leader will process checking 1701 // for any streams, and consequently any consumers. 1702 func (js *jetStream) processAddPeer(peer string) { 1703 js.mu.Lock() 1704 defer js.mu.Unlock() 1705 1706 s, cc := js.srv, js.cluster 1707 if cc == nil || cc.meta == nil { 1708 return 1709 } 1710 isLeader := cc.isLeader() 1711 1712 // Now check if we are meta-leader. We will check for any re-assignments. 1713 if !isLeader { 1714 return 1715 } 1716 1717 sir, ok := s.nodeToInfo.Load(peer) 1718 if !ok || sir == nil { 1719 return 1720 } 1721 si := sir.(nodeInfo) 1722 1723 for _, asa := range cc.streams { 1724 for _, sa := range asa { 1725 if sa.missingPeers() { 1726 // Make sure the right cluster etc. 1727 if si.cluster != sa.Client.Cluster { 1728 continue 1729 } 1730 // If we are here we can add in this peer. 1731 csa := sa.copyGroup() 1732 csa.Group.Peers = append(csa.Group.Peers, peer) 1733 // Send our proposal for this csa. Also use same group definition for all the consumers as well. 1734 cc.meta.Propose(encodeAddStreamAssignment(csa)) 1735 for _, ca := range sa.consumers { 1736 // Ephemerals are R=1, so only auto-remap durables, or R>1. 1737 if ca.Config.Durable != _EMPTY_ || len(ca.Group.Peers) > 1 { 1738 cca := ca.copyGroup() 1739 cca.Group.Peers = csa.Group.Peers 1740 cc.meta.Propose(encodeAddConsumerAssignment(cca)) 1741 } 1742 } 1743 } 1744 } 1745 } 1746 } 1747 1748 func (js *jetStream) processRemovePeer(peer string) { 1749 // We may be already disabled. 
1750 if js == nil || js.disabled.Load() { 1751 return 1752 } 1753 1754 js.mu.Lock() 1755 s, cc := js.srv, js.cluster 1756 if cc == nil || cc.meta == nil { 1757 js.mu.Unlock() 1758 return 1759 } 1760 isLeader := cc.isLeader() 1761 // All nodes will check if this is them. 1762 isUs := cc.meta.ID() == peer 1763 js.mu.Unlock() 1764 1765 if isUs { 1766 s.Errorf("JetStream being DISABLED, our server was removed from the cluster") 1767 adv := &JSServerRemovedAdvisory{ 1768 TypedEvent: TypedEvent{ 1769 Type: JSServerRemovedAdvisoryType, 1770 ID: nuid.Next(), 1771 Time: time.Now().UTC(), 1772 }, 1773 Server: s.Name(), 1774 ServerID: s.ID(), 1775 Cluster: s.cachedClusterName(), 1776 Domain: s.getOpts().JetStreamDomain, 1777 } 1778 s.publishAdvisory(nil, JSAdvisoryServerRemoved, adv) 1779 1780 go s.DisableJetStream() 1781 } 1782 1783 // Now check if we are meta-leader. We will attempt re-assignment. 1784 if !isLeader { 1785 return 1786 } 1787 1788 js.mu.Lock() 1789 defer js.mu.Unlock() 1790 1791 for _, asa := range cc.streams { 1792 for _, sa := range asa { 1793 if rg := sa.Group; rg.isMember(peer) { 1794 js.removePeerFromStreamLocked(sa, peer) 1795 } 1796 } 1797 } 1798 } 1799 1800 // Assumes all checks have already been done. 1801 func (js *jetStream) removePeerFromStream(sa *streamAssignment, peer string) bool { 1802 js.mu.Lock() 1803 defer js.mu.Unlock() 1804 return js.removePeerFromStreamLocked(sa, peer) 1805 } 1806 1807 // Lock should be held. 1808 func (js *jetStream) removePeerFromStreamLocked(sa *streamAssignment, peer string) bool { 1809 if rg := sa.Group; !rg.isMember(peer) { 1810 return false 1811 } 1812 1813 s, cc, csa := js.srv, js.cluster, sa.copyGroup() 1814 if cc == nil || cc.meta == nil { 1815 return false 1816 } 1817 replaced := cc.remapStreamAssignment(csa, peer) 1818 if !replaced { 1819 s.Warnf("JetStream cluster could not replace peer for stream '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name) 1820 } 1821 1822 // Send our proposal for this csa. Also use same group definition for all the consumers as well. 1823 cc.meta.Propose(encodeAddStreamAssignment(csa)) 1824 rg := csa.Group 1825 for _, ca := range sa.consumers { 1826 // Ephemerals are R=1, so only auto-remap durables, or R>1. 1827 if ca.Config.Durable != _EMPTY_ { 1828 cca := ca.copyGroup() 1829 cca.Group.Peers, cca.Group.Preferred = rg.Peers, _EMPTY_ 1830 cc.meta.Propose(encodeAddConsumerAssignment(cca)) 1831 } else if ca.Group.isMember(peer) { 1832 // These are ephemerals. Check to see if we deleted this peer. 1833 cc.meta.Propose(encodeDeleteConsumerAssignment(ca)) 1834 } 1835 } 1836 return replaced 1837 } 1838 1839 // Check if we have peer related entries. 
1840 func (js *jetStream) hasPeerEntries(entries []*Entry) bool { 1841 for _, e := range entries { 1842 if e.Type == EntryRemovePeer || e.Type == EntryAddPeer { 1843 return true 1844 } 1845 } 1846 return false 1847 } 1848 1849 const ksep = ":" 1850 1851 func (sa *streamAssignment) recoveryKey() string { 1852 if sa == nil { 1853 return _EMPTY_ 1854 } 1855 return sa.Client.serviceAccount() + ksep + sa.Config.Name 1856 } 1857 1858 func (ca *consumerAssignment) recoveryKey() string { 1859 if ca == nil { 1860 return _EMPTY_ 1861 } 1862 return ca.Client.serviceAccount() + ksep + ca.Stream + ksep + ca.Name 1863 } 1864 1865 func (js *jetStream) applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bool, bool, bool, error) { 1866 var didSnap, didRemoveStream, didRemoveConsumer bool 1867 isRecovering := js.isMetaRecovering() 1868 1869 for _, e := range entries { 1870 if e.Type == EntrySnapshot { 1871 js.applyMetaSnapshot(e.Data, ru, isRecovering) 1872 didSnap = true 1873 } else if e.Type == EntryRemovePeer { 1874 if !isRecovering { 1875 js.processRemovePeer(string(e.Data)) 1876 } 1877 } else if e.Type == EntryAddPeer { 1878 if !isRecovering { 1879 js.processAddPeer(string(e.Data)) 1880 } 1881 } else { 1882 buf := e.Data 1883 switch entryOp(buf[0]) { 1884 case assignStreamOp: 1885 sa, err := decodeStreamAssignment(buf[1:]) 1886 if err != nil { 1887 js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:]) 1888 return didSnap, didRemoveStream, didRemoveConsumer, err 1889 } 1890 if isRecovering { 1891 js.setStreamAssignmentRecovering(sa) 1892 delete(ru.removeStreams, sa.recoveryKey()) 1893 } 1894 if js.processStreamAssignment(sa) { 1895 didRemoveStream = true 1896 } 1897 case removeStreamOp: 1898 sa, err := decodeStreamAssignment(buf[1:]) 1899 if err != nil { 1900 js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:]) 1901 return didSnap, didRemoveStream, didRemoveConsumer, err 1902 } 1903 if isRecovering { 1904 js.setStreamAssignmentRecovering(sa) 1905 key := sa.recoveryKey() 1906 ru.removeStreams[key] = sa 1907 delete(ru.updateStreams, key) 1908 } else { 1909 js.processStreamRemoval(sa) 1910 didRemoveStream = true 1911 } 1912 case assignConsumerOp: 1913 ca, err := decodeConsumerAssignment(buf[1:]) 1914 if err != nil { 1915 js.srv.Errorf("JetStream cluster failed to decode consumer assignment: %q", buf[1:]) 1916 return didSnap, didRemoveStream, didRemoveConsumer, err 1917 } 1918 if isRecovering { 1919 js.setConsumerAssignmentRecovering(ca) 1920 key := ca.recoveryKey() 1921 delete(ru.removeConsumers, key) 1922 ru.updateConsumers[key] = ca 1923 } else { 1924 js.processConsumerAssignment(ca) 1925 } 1926 case assignCompressedConsumerOp: 1927 ca, err := decodeConsumerAssignmentCompressed(buf[1:]) 1928 if err != nil { 1929 js.srv.Errorf("JetStream cluster failed to decode compressed consumer assignment: %q", buf[1:]) 1930 return didSnap, didRemoveStream, didRemoveConsumer, err 1931 } 1932 if isRecovering { 1933 js.setConsumerAssignmentRecovering(ca) 1934 key := ca.recoveryKey() 1935 delete(ru.removeConsumers, key) 1936 ru.updateConsumers[key] = ca 1937 } else { 1938 js.processConsumerAssignment(ca) 1939 } 1940 case removeConsumerOp: 1941 ca, err := decodeConsumerAssignment(buf[1:]) 1942 if err != nil { 1943 js.srv.Errorf("JetStream cluster failed to decode consumer assignment: %q", buf[1:]) 1944 return didSnap, didRemoveStream, didRemoveConsumer, err 1945 } 1946 if isRecovering { 1947 js.setConsumerAssignmentRecovering(ca) 1948 key := ca.recoveryKey() 
1949 ru.removeConsumers[key] = ca 1950 delete(ru.updateConsumers, key) 1951 } else { 1952 js.processConsumerRemoval(ca) 1953 didRemoveConsumer = true 1954 } 1955 case updateStreamOp: 1956 sa, err := decodeStreamAssignment(buf[1:]) 1957 if err != nil { 1958 js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:]) 1959 return didSnap, didRemoveStream, didRemoveConsumer, err 1960 } 1961 if isRecovering { 1962 js.setStreamAssignmentRecovering(sa) 1963 key := sa.recoveryKey() 1964 ru.updateStreams[key] = sa 1965 delete(ru.removeStreams, key) 1966 } else { 1967 js.processUpdateStreamAssignment(sa) 1968 // Since an update can be lowering replica count, we want upper layer to treat 1969 // similar to a removal and snapshot to collapse old entries. 1970 didRemoveStream = true 1971 } 1972 default: 1973 panic(fmt.Sprintf("JetStream Cluster Unknown meta entry op type: %v", entryOp(buf[0]))) 1974 } 1975 } 1976 } 1977 return didSnap, didRemoveStream, didRemoveConsumer, nil 1978 } 1979 1980 func (rg *raftGroup) isMember(id string) bool { 1981 if rg == nil { 1982 return false 1983 } 1984 for _, peer := range rg.Peers { 1985 if peer == id { 1986 return true 1987 } 1988 } 1989 return false 1990 } 1991 1992 func (rg *raftGroup) setPreferred() { 1993 if rg == nil || len(rg.Peers) == 0 { 1994 return 1995 } 1996 if len(rg.Peers) == 1 { 1997 rg.Preferred = rg.Peers[0] 1998 } else { 1999 // For now just randomly select a peer for the preferred. 2000 pi := rand.Int31n(int32(len(rg.Peers))) 2001 rg.Preferred = rg.Peers[pi] 2002 } 2003 } 2004 2005 // createRaftGroup is called to spin up this raft group if needed. 2006 func (js *jetStream) createRaftGroup(accName string, rg *raftGroup, storage StorageType, labels pprofLabels) error { 2007 js.mu.Lock() 2008 s, cc := js.srv, js.cluster 2009 if cc == nil || cc.meta == nil { 2010 js.mu.Unlock() 2011 return NewJSClusterNotActiveError() 2012 } 2013 2014 // If this is a single peer raft group or we are not a member return. 2015 if len(rg.Peers) <= 1 || !rg.isMember(cc.meta.ID()) { 2016 js.mu.Unlock() 2017 // Nothing to do here. 2018 return nil 2019 } 2020 2021 // Check if we already have this assigned. 2022 if node := s.lookupRaftNode(rg.Name); node != nil { 2023 s.Debugf("JetStream cluster already has raft group %q assigned", rg.Name) 2024 rg.node = node 2025 js.mu.Unlock() 2026 return nil 2027 } 2028 2029 s.Debugf("JetStream cluster creating raft group:%+v", rg) 2030 js.mu.Unlock() 2031 2032 sysAcc := s.SystemAccount() 2033 if sysAcc == nil { 2034 s.Debugf("JetStream cluster detected shutdown processing raft group: %+v", rg) 2035 return errors.New("shutting down") 2036 } 2037 2038 // Check here to see if we have a max HA Assets limit set. 2039 if maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets; maxHaAssets > 0 { 2040 if s.numRaftNodes() > maxHaAssets { 2041 s.Warnf("Maximum HA Assets limit reached: %d", maxHaAssets) 2042 // Since the meta leader assigned this, send a statsz update to them to get them up to date. 
2043 go s.sendStatszUpdate() 2044 return errors.New("system limit reached") 2045 } 2046 } 2047 2048 storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, rg.Name) 2049 var store StreamStore 2050 if storage == FileStorage { 2051 fs, err := newFileStoreWithCreated( 2052 FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMediumBlockSize, AsyncFlush: false, SyncInterval: 5 * time.Minute, srv: s}, 2053 StreamConfig{Name: rg.Name, Storage: FileStorage, Metadata: labels}, 2054 time.Now().UTC(), 2055 s.jsKeyGen(s.getOpts().JetStreamKey, rg.Name), 2056 s.jsKeyGen(s.getOpts().JetStreamOldKey, rg.Name), 2057 ) 2058 if err != nil { 2059 s.Errorf("Error creating filestore WAL: %v", err) 2060 return err 2061 } 2062 store = fs 2063 } else { 2064 ms, err := newMemStore(&StreamConfig{Name: rg.Name, Storage: MemoryStorage}) 2065 if err != nil { 2066 s.Errorf("Error creating memstore WAL: %v", err) 2067 return err 2068 } 2069 store = ms 2070 } 2071 2072 cfg := &RaftConfig{Name: rg.Name, Store: storeDir, Log: store, Track: true} 2073 2074 if _, err := readPeerState(storeDir); err != nil { 2075 s.bootstrapRaftNode(cfg, rg.Peers, true) 2076 } 2077 2078 n, err := s.startRaftNode(accName, cfg, labels) 2079 if err != nil || n == nil { 2080 s.Debugf("Error creating raft group: %v", err) 2081 return err 2082 } 2083 // Need locking here for the assignment to avoid data-race reports 2084 js.mu.Lock() 2085 rg.node = n 2086 // See if we are preferred and should start campaign immediately. 2087 if n.ID() == rg.Preferred && n.Term() == 0 { 2088 n.Campaign() 2089 } 2090 js.mu.Unlock() 2091 return nil 2092 } 2093 2094 func (mset *stream) raftGroup() *raftGroup { 2095 if mset == nil { 2096 return nil 2097 } 2098 mset.mu.RLock() 2099 defer mset.mu.RUnlock() 2100 if mset.sa == nil { 2101 return nil 2102 } 2103 return mset.sa.Group 2104 } 2105 2106 func (mset *stream) raftNode() RaftNode { 2107 if mset == nil { 2108 return nil 2109 } 2110 mset.mu.RLock() 2111 defer mset.mu.RUnlock() 2112 return mset.node 2113 } 2114 2115 func (mset *stream) removeNode() { 2116 mset.mu.Lock() 2117 defer mset.mu.Unlock() 2118 if n := mset.node; n != nil { 2119 n.Delete() 2120 mset.node = nil 2121 } 2122 } 2123 2124 func (mset *stream) clearRaftNode() { 2125 if mset == nil { 2126 return 2127 } 2128 mset.mu.Lock() 2129 defer mset.mu.Unlock() 2130 mset.node = nil 2131 } 2132 2133 // Helper function to generate peer info. 2134 // lists and sets for old and new. 2135 func genPeerInfo(peers []string, split int) (newPeers, oldPeers []string, newPeerSet, oldPeerSet map[string]bool) { 2136 newPeers = peers[split:] 2137 oldPeers = peers[:split] 2138 newPeerSet = make(map[string]bool, len(newPeers)) 2139 oldPeerSet = make(map[string]bool, len(oldPeers)) 2140 for i, peer := range peers { 2141 if i < split { 2142 oldPeerSet[peer] = true 2143 } else { 2144 newPeerSet[peer] = true 2145 } 2146 } 2147 return 2148 } 2149 2150 // Monitor our stream node for this stream. 2151 func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnapshot bool) { 2152 s, cc := js.server(), js.cluster 2153 defer s.grWG.Done() 2154 if mset != nil { 2155 defer mset.monitorWg.Done() 2156 } 2157 js.mu.RLock() 2158 n := sa.Group.node 2159 meta := cc.meta 2160 js.mu.RUnlock() 2161 2162 if n == nil || meta == nil { 2163 s.Warnf("No RAFT group for '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name) 2164 return 2165 } 2166 2167 // Make sure only one is running. 
2168 	if mset != nil {
2169 		if mset.checkInMonitor() {
2170 			return
2171 		}
2172 		defer mset.clearMonitorRunning()
2173 	}
2174 
2175 	// Make sure to stop the raft group on exit to prevent accidental memory bloat.
2176 	// This should be below the checkInMonitor call though to avoid stopping it out
2177 	// from underneath the one that is running since it will be the same raft node.
2178 	defer n.Stop()
2179 
2180 	qch, mqch, lch, aq, uch, ourPeerId := n.QuitC(), mset.monitorQuitC(), n.LeadChangeC(), n.ApplyQ(), mset.updateC(), meta.ID()
2181 
2182 	s.Debugf("Starting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group())
2183 	defer s.Debugf("Exiting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group())
2184 
2185 	// Make sure we do not let the apply channel fill up and block the raft layer.
2186 	defer func() {
2187 		if n.State() == Closed {
2188 			return
2189 		}
2190 		if n.Leader() {
2191 			n.StepDown()
2192 		}
2193 		// Drain the commit queue...
2194 		aq.drain()
2195 	}()
2196 
2197 	const (
2198 		compactInterval = 2 * time.Minute
2199 		compactSizeMin  = 8 * 1024 * 1024
2200 		compactNumMin   = 65536
2201 		minSnapDelta    = 10 * time.Second
2202 	)
2203 
2204 	// Spread these out for large numbers on server restart.
2205 	rci := time.Duration(rand.Int63n(int64(time.Minute)))
2206 	t := time.NewTicker(compactInterval + rci)
2207 	defer t.Stop()
2208 
2209 	js.mu.RLock()
2210 	isLeader := cc.isStreamLeader(sa.Client.serviceAccount(), sa.Config.Name)
2211 	isRestore := sa.Restore != nil
2212 	js.mu.RUnlock()
2213 
2214 	acc, err := s.LookupAccount(sa.Client.serviceAccount())
2215 	if err != nil {
2216 		s.Warnf("Could not retrieve account for stream '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name)
2217 		return
2218 	}
2219 	accName := acc.GetName()
2220 
2221 	// Used to detect a changed state quickly without building a complete and detailed
2222 	// state, which could be costly in terms of memory, CPU and GC.
2223 	// This only entails how many messages, and the first and last sequence of the stream.
2224 	// That is all that is needed to detect a change, and we can get it from FilteredState()
2225 	// with an empty filter.
2226 	var lastState SimpleState
2227 	var lastSnapTime time.Time
2228 
2229 	// Don't allow the upper layer to install snapshots until we have
2230 	// fully recovered from disk.
2231 	isRecovering := true
2232 
2233 	// Should only be called from the leader.
2234 	doSnapshot := func() {
2235 		if mset == nil || isRecovering || isRestore || time.Since(lastSnapTime) < minSnapDelta {
2236 			return
2237 		}
2238 
2239 		// Before we actually calculate the detailed state and encode it, let's check the
2240 		// simple state to detect any changes.
2241 		curState := mset.store.FilteredState(0, _EMPTY_)
2242 
2243 		// If the state hasn't changed but the log has gone way over
2244 		// the compaction size, then we will want to compact anyway.
2245 		// This shouldn't happen for streams like it can for pull
2246 		// consumers on idle streams, but better to be safe than sorry!
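		// A note on the check below (thresholds from the constants above): we only skip
		// the snapshot when the simple state is unchanged AND the raft log is still under
		// both compactNumMin entries and compactSizeMin bytes; crossing either threshold
		// forces a snapshot/compact even with an unchanged state.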
2247 ne, nb := n.Size() 2248 if curState == lastState && ne < compactNumMin && nb < compactSizeMin { 2249 return 2250 } 2251 2252 if err := n.InstallSnapshot(mset.stateSnapshot()); err == nil { 2253 lastState, lastSnapTime = curState, time.Now() 2254 } else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning { 2255 s.RateLimitWarnf("Failed to install snapshot for '%s > %s' [%s]: %v", mset.acc.Name, mset.name(), n.Group(), err) 2256 } 2257 } 2258 2259 // We will establish a restoreDoneCh no matter what. Will never be triggered unless 2260 // we replace with the restore chan. 2261 restoreDoneCh := make(<-chan error) 2262 2263 // For migration tracking. 2264 var mmt *time.Ticker 2265 var mmtc <-chan time.Time 2266 2267 startMigrationMonitoring := func() { 2268 if mmt == nil { 2269 mmt = time.NewTicker(500 * time.Millisecond) 2270 mmtc = mmt.C 2271 } 2272 } 2273 2274 stopMigrationMonitoring := func() { 2275 if mmt != nil { 2276 mmt.Stop() 2277 mmt, mmtc = nil, nil 2278 } 2279 } 2280 defer stopMigrationMonitoring() 2281 2282 // This is to optionally track when we are ready as a non-leader for direct access participation. 2283 // Either direct or if we are a direct mirror, or both. 2284 var dat *time.Ticker 2285 var datc <-chan time.Time 2286 2287 startDirectAccessMonitoring := func() { 2288 if dat == nil { 2289 dat = time.NewTicker(2 * time.Second) 2290 datc = dat.C 2291 } 2292 } 2293 2294 stopDirectMonitoring := func() { 2295 if dat != nil { 2296 dat.Stop() 2297 dat, datc = nil, nil 2298 } 2299 } 2300 defer stopDirectMonitoring() 2301 2302 // Check if we are interest based and if so and we have an active stream wait until we 2303 // have the consumers attached. This can become important when a server has lots of assets 2304 // since we process streams first then consumers as an asset class. 2305 if mset != nil && mset.isInterestRetention() { 2306 js.mu.RLock() 2307 numExpectedConsumers := len(sa.consumers) 2308 js.mu.RUnlock() 2309 if mset.numConsumers() < numExpectedConsumers { 2310 s.Debugf("Waiting for consumers for interest based stream '%s > %s'", accName, mset.name()) 2311 // Wait up to 10s 2312 const maxWaitTime = 10 * time.Second 2313 const sleepTime = 250 * time.Millisecond 2314 timeout := time.Now().Add(maxWaitTime) 2315 for time.Now().Before(timeout) { 2316 if mset.numConsumers() >= numExpectedConsumers { 2317 break 2318 } 2319 select { 2320 case <-s.quitCh: 2321 return 2322 case <-time.After(sleepTime): 2323 } 2324 } 2325 if actual := mset.numConsumers(); actual < numExpectedConsumers { 2326 s.Warnf("All consumers not online for '%s > %s': expected %d but only have %d", accName, mset.name(), numExpectedConsumers, actual) 2327 } 2328 } 2329 } 2330 2331 // This is triggered during a scale up from R1 to clustered mode. We need the new followers to catchup, 2332 // similar to how we trigger the catchup mechanism post a backup/restore. 2333 // We can arrive here NOT being the leader, so we send the snapshot only if we are, and in this case 2334 // reset the notion that we need to send the snapshot. If we are not, then the first time the server 2335 // will switch to leader (in the loop below), we will send the snapshot. 
2336 if sendSnapshot && isLeader && mset != nil && n != nil && !isRecovering { 2337 n.SendSnapshot(mset.stateSnapshot()) 2338 sendSnapshot = false 2339 } 2340 2341 for { 2342 select { 2343 case <-s.quitCh: 2344 return 2345 case <-mqch: 2346 return 2347 case <-qch: 2348 return 2349 case <-aq.ch: 2350 var ne, nb uint64 2351 ces := aq.pop() 2352 for _, ce := range ces { 2353 // No special processing needed for when we are caught up on restart. 2354 if ce == nil { 2355 isRecovering = false 2356 // Make sure we create a new snapshot in case things have changed such that any existing 2357 // snapshot may no longer be valid. 2358 doSnapshot() 2359 // If we became leader during this time and we need to send a snapshot to our 2360 // followers, i.e. as a result of a scale-up from R1, do it now. 2361 if sendSnapshot && isLeader && mset != nil && n != nil { 2362 n.SendSnapshot(mset.stateSnapshot()) 2363 sendSnapshot = false 2364 } 2365 continue 2366 } 2367 // Apply our entries. 2368 if err := js.applyStreamEntries(mset, ce, isRecovering); err == nil { 2369 // Update our applied. 2370 ne, nb = n.Applied(ce.Index) 2371 ce.ReturnToPool() 2372 } else { 2373 // Our stream was closed out from underneath of us, simply return here. 2374 if err == errStreamClosed { 2375 return 2376 } 2377 s.Warnf("Error applying entries to '%s > %s': %v", accName, sa.Config.Name, err) 2378 if isClusterResetErr(err) { 2379 if mset.isMirror() && mset.IsLeader() { 2380 mset.retryMirrorConsumer() 2381 continue 2382 } 2383 // We will attempt to reset our cluster state. 2384 if mset.resetClusteredState(err) { 2385 aq.recycle(&ces) 2386 return 2387 } 2388 } else if isOutOfSpaceErr(err) { 2389 // If applicable this will tear all of this down, but don't assume so and return. 2390 s.handleOutOfSpace(mset) 2391 } 2392 } 2393 } 2394 aq.recycle(&ces) 2395 2396 // Check about snapshotting 2397 // If we have at least min entries to compact, go ahead and try to snapshot/compact. 2398 if ne >= compactNumMin || nb > compactSizeMin { 2399 doSnapshot() 2400 } 2401 2402 case isLeader = <-lch: 2403 if isLeader { 2404 if mset != nil && n != nil && sendSnapshot && !isRecovering { 2405 // If we *are* recovering at the time then this will get done when the apply queue 2406 // handles the nil guard to show the catchup ended. 2407 n.SendSnapshot(mset.stateSnapshot()) 2408 sendSnapshot = false 2409 } 2410 if isRestore { 2411 acc, _ := s.LookupAccount(sa.Client.serviceAccount()) 2412 restoreDoneCh = s.processStreamRestore(sa.Client, acc, sa.Config, _EMPTY_, sa.Reply, _EMPTY_) 2413 continue 2414 } else if n != nil && n.NeedSnapshot() { 2415 doSnapshot() 2416 } 2417 // Always cancel if this was running. 2418 stopDirectMonitoring() 2419 2420 } else if n.GroupLeader() != noLeader { 2421 js.setStreamAssignmentRecovering(sa) 2422 } 2423 2424 // Process our leader change. 2425 js.processStreamLeaderChange(mset, isLeader) 2426 2427 // We may receive a leader change after the stream assignment which would cancel us 2428 // monitoring for this closely. So re-assess our state here as well. 2429 // Or the old leader is no longer part of the set and transferred leadership 2430 // for this leader to resume with removal 2431 migrating := mset.isMigrating() 2432 2433 // Check for migrations here. We set the state on the stream assignment update below. 2434 if isLeader && migrating { 2435 startMigrationMonitoring() 2436 } 2437 2438 // Here we are checking if we are not the leader but we have been asked to allow 2439 // direct access. 
We now allow non-leaders to participate in the queue group. 2440 if !isLeader && mset != nil { 2441 mset.mu.RLock() 2442 ad, md := mset.cfg.AllowDirect, mset.cfg.MirrorDirect 2443 mset.mu.RUnlock() 2444 if ad || md { 2445 startDirectAccessMonitoring() 2446 } 2447 } 2448 2449 case <-datc: 2450 if mset == nil || isRecovering { 2451 continue 2452 } 2453 // If we are leader we can stop, we know this is setup now. 2454 if isLeader { 2455 stopDirectMonitoring() 2456 continue 2457 } 2458 2459 mset.mu.Lock() 2460 ad, md, current := mset.cfg.AllowDirect, mset.cfg.MirrorDirect, mset.isCurrent() 2461 if !current { 2462 const syncThreshold = 90.0 2463 // We are not current, but current means exactly caught up. Under heavy publish 2464 // loads we may never reach this, so check if we are within 90% caught up. 2465 _, c, a := mset.node.Progress() 2466 if c == 0 { 2467 mset.mu.Unlock() 2468 continue 2469 } 2470 if p := float64(a) / float64(c) * 100.0; p < syncThreshold { 2471 mset.mu.Unlock() 2472 continue 2473 } else { 2474 s.Debugf("Stream '%s > %s' enabling direct gets at %.0f%% synchronized", 2475 sa.Client.serviceAccount(), sa.Config.Name, p) 2476 } 2477 } 2478 // We are current, cancel monitoring and create the direct subs as needed. 2479 if ad { 2480 mset.subscribeToDirect() 2481 } 2482 if md { 2483 mset.subscribeToMirrorDirect() 2484 } 2485 mset.mu.Unlock() 2486 // Stop direct monitoring. 2487 stopDirectMonitoring() 2488 2489 case <-t.C: 2490 doSnapshot() 2491 2492 case <-uch: 2493 // keep stream assignment current 2494 sa = mset.streamAssignment() 2495 2496 // keep peer list up to date with config 2497 js.checkPeers(mset.raftGroup()) 2498 // We get this when we have a new stream assignment caused by an update. 2499 // We want to know if we are migrating. 2500 if migrating := mset.isMigrating(); migrating { 2501 if isLeader && mmtc == nil { 2502 startMigrationMonitoring() 2503 } 2504 } else { 2505 stopMigrationMonitoring() 2506 } 2507 case <-mmtc: 2508 if !isLeader { 2509 // We are no longer leader, so not our job. 2510 stopMigrationMonitoring() 2511 continue 2512 } 2513 2514 // Check to see where we are.. 2515 rg := mset.raftGroup() 2516 2517 // Track the new peers and check the ones that are current. 2518 mset.mu.RLock() 2519 replicas := mset.cfg.Replicas 2520 mset.mu.RUnlock() 2521 if len(rg.Peers) <= replicas { 2522 // Migration no longer happening, so not our job anymore 2523 stopMigrationMonitoring() 2524 continue 2525 } 2526 2527 // Make sure we have correct cluster information on the other peers. 2528 ci := js.clusterInfo(rg) 2529 mset.checkClusterInfo(ci) 2530 2531 newPeers, oldPeers, newPeerSet, oldPeerSet := genPeerInfo(rg.Peers, len(rg.Peers)-replicas) 2532 2533 // If we are part of the new peerset and we have been passed the baton. 2534 // We will handle scale down. 2535 if newPeerSet[ourPeerId] { 2536 // First need to check on any consumers and make sure they have moved properly before scaling down ourselves. 2537 js.mu.RLock() 2538 var needToWait bool 2539 for name, c := range sa.consumers { 2540 for _, peer := range c.Group.Peers { 2541 // If we have peers still in the old set block. 2542 if oldPeerSet[peer] { 2543 s.Debugf("Scale down of '%s > %s' blocked by consumer '%s'", accName, sa.Config.Name, name) 2544 needToWait = true 2545 break 2546 } 2547 } 2548 if needToWait { 2549 break 2550 } 2551 } 2552 js.mu.RUnlock() 2553 if needToWait { 2554 continue 2555 } 2556 2557 // We are good to go, can scale down here. 
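			// Illustrative example (hypothetical peer IDs): with rg.Peers = [p1 p2 p3 p4 p5]
			// and replicas = 3, genPeerInfo split at len(rg.Peers)-replicas = 2 above, giving
			// oldPeers = [p1 p2] and newPeers = [p3 p4 p5]. Below we propose removal of each
			// old peer, then forward the updated assignment pinned to the new peer set.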
2558 			for _, p := range oldPeers {
2559 				n.ProposeRemovePeer(p)
2560 			}
2561 
2562 			csa := sa.copyGroup()
2563 			csa.Group.Peers = newPeers
2564 			csa.Group.Preferred = ourPeerId
2565 			csa.Group.Cluster = s.cachedClusterName()
2566 			cc.meta.ForwardProposal(encodeUpdateStreamAssignment(csa))
2567 			s.Noticef("Scaling down '%s > %s' to %+v", accName, sa.Config.Name, s.peerSetToNames(newPeers))
2568 		} else {
2569 			// We are the old leader here, from the original peer set.
2570 			// We are simply waiting on the new peer set to be caught up so we can transfer leadership.
2571 			var newLeaderPeer, newLeader string
2572 			neededCurrent, current := replicas/2+1, 0
2573 
2574 			for _, r := range ci.Replicas {
2575 				if r.Current && newPeerSet[r.Peer] {
2576 					current++
2577 					if newLeader == _EMPTY_ {
2578 						newLeaderPeer, newLeader = r.Peer, r.Name
2579 					}
2580 				}
2581 			}
2582 			// Check if we have a quorum.
2583 			if current >= neededCurrent {
2584 				s.Noticef("Transfer of stream leader for '%s > %s' to '%s'", accName, sa.Config.Name, newLeader)
2585 				n.UpdateKnownPeers(newPeers)
2586 				n.StepDown(newLeaderPeer)
2587 			}
2588 		}
2589 
2590 	case err := <-restoreDoneCh:
2591 		// We have completed a restore from snapshot on this server. The stream has
2592 		// already been assigned, but the replicas will need to catch up out of band. Consumers
2593 		// will need to be assigned by forwarding the proposal and stamping the initial state.
2594 		s.Debugf("Stream restore for '%s > %s' completed", sa.Client.serviceAccount(), sa.Config.Name)
2595 		if err != nil {
2596 			s.Debugf("Stream restore failed: %v", err)
2597 		}
2598 		isRestore = false
2599 		sa.Restore = nil
2600 		// If we were successful, look up our stream now.
2601 		if err == nil {
2602 			if mset, err = acc.lookupStream(sa.Config.Name); mset != nil {
2603 				mset.monitorWg.Add(1)
2604 				defer mset.monitorWg.Done()
2605 				mset.setStreamAssignment(sa)
2606 				// Make sure to update our updateC which would have been nil.
2607 				uch = mset.updateC()
2608 				// Also update our mqch.
2609 				mqch = mset.monitorQuitC()
2610 			}
2611 		}
2612 		if err != nil {
2613 			if mset != nil {
2614 				mset.delete()
2615 			}
2616 			js.mu.Lock()
2617 			sa.err = err
2618 			if n != nil {
2619 				n.Delete()
2620 			}
2621 			result := &streamAssignmentResult{
2622 				Account: sa.Client.serviceAccount(),
2623 				Stream:  sa.Config.Name,
2624 				Restore: &JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}},
2625 			}
2626 			result.Restore.Error = NewJSStreamAssignmentError(err, Unless(err))
2627 			js.mu.Unlock()
2628 			// Send response to the metadata leader. They will forward to the user as needed.
2629 			s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result)
2630 			return
2631 		}
2632 
2633 		if !isLeader {
2634 			panic("Finished restore but not leader")
2635 		}
2636 		// Trigger the stream followers to catch up.
2637 		if n = mset.raftNode(); n != nil {
2638 			n.SendSnapshot(mset.stateSnapshot())
2639 		}
2640 		js.processStreamLeaderChange(mset, isLeader)
2641 
2642 		// Check to see if we have restored consumers here.
2643 		// These are not currently assigned so we will need to do so here.
2644 		if consumers := mset.getPublicConsumers(); len(consumers) > 0 {
2645 			for _, o := range consumers {
2646 				name, cfg := o.String(), o.config()
2647 				rg := cc.createGroupForConsumer(&cfg, sa)
2648 				// Pick a preferred leader.
2649 				rg.setPreferred()
2650 
2651 				// Place our initial state here as well for assignment distribution.
2652 				state, _ := o.store.State()
2653 				ca := &consumerAssignment{
2654 					Group:   rg,
2655 					Stream:  sa.Config.Name,
2656 					Name:    name,
2657 					Config:  &cfg,
2658 					Client:  sa.Client,
2659 					Created: o.createdTime(),
2660 					State:   state,
2661 				}
2662 
2663 				// We make these compressed in case state is complex.
2664 				addEntry := encodeAddConsumerAssignmentCompressed(ca)
2665 				cc.meta.ForwardProposal(addEntry)
2666 
2667 				// Check to make sure we see the assignment.
2668 				go func() {
2669 					ticker := time.NewTicker(time.Second)
2670 					defer ticker.Stop()
2671 					for range ticker.C {
2672 						js.mu.RLock()
2673 						ca, meta := js.consumerAssignment(ca.Client.serviceAccount(), sa.Config.Name, name), cc.meta
2674 						js.mu.RUnlock()
2675 						if ca == nil {
2676 							s.Warnf("Consumer assignment has not been assigned, retrying")
2677 							if meta != nil {
2678 								meta.ForwardProposal(addEntry)
2679 							} else {
2680 								return
2681 							}
2682 						} else {
2683 							return
2684 						}
2685 					}
2686 				}()
2687 			}
2688 		}
2689 	}
2690 	}
2691 }
2692 
2693 // Determine if we are migrating.
2694 func (mset *stream) isMigrating() bool {
2695 	if mset == nil {
2696 		return false
2697 	}
2698 
2699 	mset.mu.RLock()
2700 	js, sa := mset.js, mset.sa
2701 	mset.mu.RUnlock()
2702 
2703 	js.mu.RLock()
2704 	defer js.mu.RUnlock()
2705 
2706 	// During migration we will always be R>1, even when we start R1.
2707 	// So if we do not have a group or node we know we are not migrating.
2708 	if sa == nil || sa.Group == nil || sa.Group.node == nil {
2709 		return false
2710 	}
2711 	// The sign of migration is if our group peer count != configured replica count.
2712 	if sa.Config.Replicas == len(sa.Group.Peers) {
2713 		return false
2714 	}
2715 	return true
2716 }
2717 
2718 // resetClusteredState is called when a clustered stream had an error (e.g. sequence mismatch, bad snapshot) and needs to be reset.
2719 func (mset *stream) resetClusteredState(err error) bool {
2720 	mset.mu.RLock()
2721 	s, js, jsa, sa, acc, node := mset.srv, mset.js, mset.jsa, mset.sa, mset.acc, mset.node
2722 	stype, isLeader, tierName, replicas := mset.cfg.Storage, mset.isLeader(), mset.tier, mset.cfg.Replicas
2723 	mset.mu.RUnlock()
2724 
2725 	// Step down here if we are the leader, regardless of the error.
2726 	if isLeader && node != nil {
2727 		node.StepDown()
2728 	}
2729 
2730 	// If we detect we are shutting down, just return.
2731 	if js != nil && js.isShuttingDown() {
2732 		s.Debugf("Will not reset stream, jetstream shutting down")
2733 		return false
2734 	}
2735 
2736 	// Server
2737 	if js.limitsExceeded(stype) {
2738 		s.Warnf("Will not reset stream, server resources exceeded")
2739 		return false
2740 	}
2741 
2742 	// Account
2743 	if exceeded, _ := jsa.limitsExceeded(stype, tierName, replicas); exceeded {
2744 		s.Warnf("stream '%s > %s' errored, account resources exceeded", acc, mset.name())
2745 		return false
2746 	}
2747 
2748 	// We delete our raft state. Will recreate.
2749 	if node != nil {
2750 		node.Delete()
2751 	}
2752 
2753 	// Preserve our current state and messages unless we have a first sequence mismatch.
2754 	shouldDelete := err == errFirstSequenceMismatch
2755 
2756 	// Need to do the rest in a separate goroutine.
2757 	go func() {
2758 		mset.monitorWg.Wait()
2759 		mset.resetAndWaitOnConsumers()
2760 		// Stop our stream.
2761 		mset.stop(shouldDelete, false)
2762 
2763 		if sa != nil {
2764 			js.mu.Lock()
2765 			if js.shuttingDown {
2766 				js.mu.Unlock()
2767 				return
2768 			}
2769 
2770 			s.Warnf("Resetting stream cluster state for '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name)
2771 			// Now wipe groups from assignments.
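			// Clearing the cached raft node pointers below forces the subsequent
			// processClusterCreateStream/processClusterCreateConsumer calls to build
			// fresh raft groups for this stream and for any consumers we host here.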
2772 sa.Group.node = nil 2773 var consumers []*consumerAssignment 2774 if cc := js.cluster; cc != nil && cc.meta != nil { 2775 ourID := cc.meta.ID() 2776 for _, ca := range sa.consumers { 2777 if rg := ca.Group; rg != nil && rg.isMember(ourID) { 2778 rg.node = nil // Erase group raft/node state. 2779 consumers = append(consumers, ca) 2780 } 2781 } 2782 } 2783 js.mu.Unlock() 2784 2785 // This will reset the stream and consumers. 2786 // Reset stream. 2787 js.processClusterCreateStream(acc, sa) 2788 // Reset consumers. 2789 for _, ca := range consumers { 2790 js.processClusterCreateConsumer(ca, nil, false) 2791 } 2792 } 2793 }() 2794 2795 return true 2796 } 2797 2798 func isControlHdr(hdr []byte) bool { 2799 return bytes.HasPrefix(hdr, []byte("NATS/1.0 100 ")) 2800 } 2801 2802 // Apply our stream entries. 2803 func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isRecovering bool) error { 2804 for _, e := range ce.Entries { 2805 if e.Type == EntryNormal { 2806 buf, op := e.Data, entryOp(e.Data[0]) 2807 switch op { 2808 case streamMsgOp, compressedStreamMsgOp: 2809 if mset == nil { 2810 continue 2811 } 2812 s := js.srv 2813 2814 mbuf := buf[1:] 2815 if op == compressedStreamMsgOp { 2816 var err error 2817 mbuf, err = s2.Decode(nil, mbuf) 2818 if err != nil { 2819 panic(err.Error()) 2820 } 2821 } 2822 2823 subject, reply, hdr, msg, lseq, ts, err := decodeStreamMsg(mbuf) 2824 if err != nil { 2825 if node := mset.raftNode(); node != nil { 2826 s.Errorf("JetStream cluster could not decode stream msg for '%s > %s' [%s]", 2827 mset.account(), mset.name(), node.Group()) 2828 } 2829 panic(err.Error()) 2830 } 2831 2832 // Check for flowcontrol here. 2833 if len(msg) == 0 && len(hdr) > 0 && reply != _EMPTY_ && isControlHdr(hdr) { 2834 if !isRecovering { 2835 mset.sendFlowControlReply(reply) 2836 } 2837 continue 2838 } 2839 2840 // Grab last sequence and CLFS. 2841 last, clfs := mset.lastSeqAndCLFS() 2842 2843 // We can skip if we know this is less than what we already have. 2844 if lseq-clfs < last { 2845 s.Debugf("Apply stream entries for '%s > %s' skipping message with sequence %d with last of %d", 2846 mset.account(), mset.name(), lseq+1-clfs, last) 2847 2848 mset.mu.Lock() 2849 // Check for any preAcks in case we are interest based. 2850 mset.clearAllPreAcks(lseq + 1 - mset.clfs) 2851 mset.mu.Unlock() 2852 continue 2853 } 2854 2855 // Skip by hand here since first msg special case. 2856 // Reason is sequence is unsigned and for lseq being 0 2857 // the lseq under stream would have to be -1. 2858 if lseq == 0 && last != 0 { 2859 continue 2860 } 2861 2862 // Messages to be skipped have no subject or timestamp or msg or hdr. 2863 if subject == _EMPTY_ && ts == 0 && len(msg) == 0 && len(hdr) == 0 { 2864 // Skip and update our lseq. 2865 last := mset.store.SkipMsg() 2866 mset.setLastSeq(last) 2867 mset.clearAllPreAcks(last) 2868 continue 2869 } 2870 2871 var mt *msgTrace 2872 // If not recovering, see if we find a message trace object for this 2873 // sequence. Only the leader that has proposed this entry will have 2874 // stored the trace info. 2875 if !isRecovering { 2876 mt = mset.getAndDeleteMsgTrace(lseq) 2877 } 2878 // Process the actual message here. 2879 err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt) 2880 2881 // If we have inflight make sure to clear after processing. 2882 // TODO(dlc) - technically check on inflight != nil could cause datarace. 2883 // But do not want to acquire lock since tracking this will be rare. 
2884 if mset.inflight != nil { 2885 mset.clMu.Lock() 2886 delete(mset.inflight, lseq) 2887 mset.clMu.Unlock() 2888 } 2889 2890 if err != nil { 2891 if err == errLastSeqMismatch { 2892 var state StreamState 2893 mset.store.FastState(&state) 2894 // If we have no msgs and the other side is delivering us a sequence past where we 2895 // should be reset. This is possible if the other side has a stale snapshot and no longer 2896 // has those messages. So compact and retry to reset. 2897 if state.Msgs == 0 { 2898 mset.store.Compact(lseq + 1) 2899 // Retry 2900 err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt) 2901 } 2902 } 2903 2904 // Only return in place if we are going to reset our stream or we are out of space, or we are closed. 2905 if isClusterResetErr(err) || isOutOfSpaceErr(err) || err == errStreamClosed { 2906 return err 2907 } 2908 s.Debugf("Apply stream entries for '%s > %s' got error processing message: %v", 2909 mset.account(), mset.name(), err) 2910 } 2911 2912 case deleteMsgOp: 2913 md, err := decodeMsgDelete(buf[1:]) 2914 if err != nil { 2915 if node := mset.raftNode(); node != nil { 2916 s := js.srv 2917 s.Errorf("JetStream cluster could not decode delete msg for '%s > %s' [%s]", 2918 mset.account(), mset.name(), node.Group()) 2919 } 2920 panic(err.Error()) 2921 } 2922 s, cc := js.server(), js.cluster 2923 2924 var removed bool 2925 if md.NoErase { 2926 removed, err = mset.removeMsg(md.Seq) 2927 } else { 2928 removed, err = mset.eraseMsg(md.Seq) 2929 } 2930 2931 // Cluster reset error. 2932 if err == ErrStoreEOF { 2933 return err 2934 } 2935 2936 if err != nil && !isRecovering { 2937 s.Debugf("JetStream cluster failed to delete stream msg %d from '%s > %s': %v", 2938 md.Seq, md.Client.serviceAccount(), md.Stream, err) 2939 } 2940 2941 js.mu.RLock() 2942 isLeader := cc.isStreamLeader(md.Client.serviceAccount(), md.Stream) 2943 js.mu.RUnlock() 2944 2945 if isLeader && !isRecovering { 2946 var resp = JSApiMsgDeleteResponse{ApiResponse: ApiResponse{Type: JSApiMsgDeleteResponseType}} 2947 if err != nil { 2948 resp.Error = NewJSStreamMsgDeleteFailedError(err, Unless(err)) 2949 s.sendAPIErrResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp)) 2950 } else if !removed { 2951 resp.Error = NewJSSequenceNotFoundError(md.Seq) 2952 s.sendAPIErrResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp)) 2953 } else { 2954 resp.Success = true 2955 s.sendAPIResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp)) 2956 } 2957 } 2958 case purgeStreamOp: 2959 sp, err := decodeStreamPurge(buf[1:]) 2960 if err != nil { 2961 if node := mset.raftNode(); node != nil { 2962 s := js.srv 2963 s.Errorf("JetStream cluster could not decode purge msg for '%s > %s' [%s]", 2964 mset.account(), mset.name(), node.Group()) 2965 } 2966 panic(err.Error()) 2967 } 2968 // If no explicit request, fill in with leader stamped last sequence to protect ourselves on replay during server start. 
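			// For example (hypothetical values): with sp.LastSeq = 100 and no explicit
			// request, we synthesize &JSApiStreamPurgeRequest{Sequence: 101} below so a
			// replay on server restart purges up to the same point rather than whatever
			// happens to be in the store at that time.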
2969 if sp.Request == nil || sp.Request.Sequence == 0 { 2970 purgeSeq := sp.LastSeq + 1 2971 if sp.Request == nil { 2972 sp.Request = &JSApiStreamPurgeRequest{Sequence: purgeSeq} 2973 } else if sp.Request.Keep == 0 { 2974 sp.Request.Sequence = purgeSeq 2975 } else if isRecovering { 2976 continue 2977 } 2978 } 2979 2980 s := js.server() 2981 purged, err := mset.purge(sp.Request) 2982 if err != nil { 2983 s.Warnf("JetStream cluster failed to purge stream %q for account %q: %v", sp.Stream, sp.Client.serviceAccount(), err) 2984 } 2985 2986 js.mu.RLock() 2987 isLeader := js.cluster.isStreamLeader(sp.Client.serviceAccount(), sp.Stream) 2988 js.mu.RUnlock() 2989 2990 if isLeader && !isRecovering { 2991 var resp = JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}} 2992 if err != nil { 2993 resp.Error = NewJSStreamGeneralError(err, Unless(err)) 2994 s.sendAPIErrResponse(sp.Client, mset.account(), sp.Subject, sp.Reply, _EMPTY_, s.jsonResponse(resp)) 2995 } else { 2996 resp.Purged = purged 2997 resp.Success = true 2998 s.sendAPIResponse(sp.Client, mset.account(), sp.Subject, sp.Reply, _EMPTY_, s.jsonResponse(resp)) 2999 } 3000 } 3001 default: 3002 panic(fmt.Sprintf("JetStream Cluster Unknown group entry op type: %v", op)) 3003 } 3004 } else if e.Type == EntrySnapshot { 3005 if mset == nil { 3006 return nil 3007 } 3008 3009 // Everything operates on new replicated state. Will convert legacy snapshots to this for processing. 3010 var ss *StreamReplicatedState 3011 3012 onBadState := func(err error) { 3013 // If we are the leader or recovering, meaning we own the snapshot, 3014 // we should stepdown and clear our raft state since our snapshot is bad. 3015 if isRecovering || mset.IsLeader() { 3016 mset.mu.RLock() 3017 s, accName, streamName := mset.srv, mset.acc.GetName(), mset.cfg.Name 3018 mset.mu.RUnlock() 3019 s.Warnf("Detected bad stream state, resetting '%s > %s'", accName, streamName) 3020 mset.resetClusteredState(err) 3021 } 3022 } 3023 3024 // Check if we are the new binary encoding. 3025 if IsEncodedStreamState(e.Data) { 3026 var err error 3027 ss, err = DecodeStreamState(e.Data) 3028 if err != nil { 3029 onBadState(err) 3030 return err 3031 } 3032 } else { 3033 var snap streamSnapshot 3034 if err := json.Unmarshal(e.Data, &snap); err != nil { 3035 onBadState(err) 3036 return err 3037 } 3038 // Convert over to StreamReplicatedState 3039 ss = &StreamReplicatedState{ 3040 Msgs: snap.Msgs, 3041 Bytes: snap.Bytes, 3042 FirstSeq: snap.FirstSeq, 3043 LastSeq: snap.LastSeq, 3044 Failed: snap.Failed, 3045 } 3046 if len(snap.Deleted) > 0 { 3047 ss.Deleted = append(ss.Deleted, DeleteSlice(snap.Deleted)) 3048 } 3049 } 3050 3051 if !isRecovering && !mset.IsLeader() { 3052 if err := mset.processSnapshot(ss); err != nil { 3053 return err 3054 } 3055 } else if isRecovering { 3056 // On recovery, reset CLFS/FAILED. 3057 mset.setCLFS(ss.Failed) 3058 } 3059 } else if e.Type == EntryRemovePeer { 3060 js.mu.RLock() 3061 var ourID string 3062 if js.cluster != nil && js.cluster.meta != nil { 3063 ourID = js.cluster.meta.ID() 3064 } 3065 js.mu.RUnlock() 3066 // We only need to do processing if this is us. 3067 if peer := string(e.Data); peer == ourID && mset != nil { 3068 // Double check here with the registered stream assignment. 
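				// Only stop the stream below if the currently registered assignment no
				// longer lists this server as a member.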
3069 shouldRemove := true 3070 if sa := mset.streamAssignment(); sa != nil && sa.Group != nil { 3071 js.mu.RLock() 3072 shouldRemove = !sa.Group.isMember(ourID) 3073 js.mu.RUnlock() 3074 } 3075 if shouldRemove { 3076 mset.stop(true, false) 3077 } 3078 } 3079 return nil 3080 } 3081 } 3082 return nil 3083 } 3084 3085 // Returns the PeerInfo for all replicas of a raft node. This is different than node.Peers() 3086 // and is used for external facing advisories. 3087 func (s *Server) replicas(node RaftNode) []*PeerInfo { 3088 now := time.Now() 3089 var replicas []*PeerInfo 3090 for _, rp := range node.Peers() { 3091 if sir, ok := s.nodeToInfo.Load(rp.ID); ok && sir != nil { 3092 si := sir.(nodeInfo) 3093 pi := &PeerInfo{Peer: rp.ID, Name: si.name, Current: rp.Current, Active: now.Sub(rp.Last), Offline: si.offline, Lag: rp.Lag} 3094 replicas = append(replicas, pi) 3095 } 3096 } 3097 return replicas 3098 } 3099 3100 // Will check our node peers and see if we should remove a peer. 3101 func (js *jetStream) checkPeers(rg *raftGroup) { 3102 js.mu.Lock() 3103 defer js.mu.Unlock() 3104 3105 // FIXME(dlc) - Single replicas? 3106 if rg == nil || rg.node == nil { 3107 return 3108 } 3109 for _, peer := range rg.node.Peers() { 3110 if !rg.isMember(peer.ID) { 3111 rg.node.ProposeRemovePeer(peer.ID) 3112 } 3113 } 3114 } 3115 3116 // Process a leader change for the clustered stream. 3117 func (js *jetStream) processStreamLeaderChange(mset *stream, isLeader bool) { 3118 if mset == nil { 3119 return 3120 } 3121 sa := mset.streamAssignment() 3122 if sa == nil { 3123 return 3124 } 3125 3126 // Clear inflight if we have it. 3127 mset.clMu.Lock() 3128 mset.inflight = nil 3129 mset.clMu.Unlock() 3130 3131 js.mu.Lock() 3132 s, account, err := js.srv, sa.Client.serviceAccount(), sa.err 3133 client, subject, reply := sa.Client, sa.Subject, sa.Reply 3134 hasResponded := sa.responded 3135 sa.responded = true 3136 peers := copyStrings(sa.Group.Peers) 3137 js.mu.Unlock() 3138 3139 streamName := mset.name() 3140 3141 if isLeader { 3142 s.Noticef("JetStream cluster new stream leader for '%s > %s'", account, streamName) 3143 s.sendStreamLeaderElectAdvisory(mset) 3144 // Check for peer removal and process here if needed. 3145 js.checkPeers(sa.Group) 3146 mset.checkAllowMsgCompress(peers) 3147 } else { 3148 // We are stepping down. 3149 // Make sure if we are doing so because we have lost quorum that we send the appropriate advisories. 3150 if node := mset.raftNode(); node != nil && !node.Quorum() && time.Since(node.Created()) > 5*time.Second { 3151 s.sendStreamLostQuorumAdvisory(mset) 3152 } 3153 } 3154 3155 // Tell stream to switch leader status. 3156 mset.setLeader(isLeader) 3157 3158 if !isLeader || hasResponded { 3159 return 3160 } 3161 3162 acc, _ := s.LookupAccount(account) 3163 if acc == nil { 3164 return 3165 } 3166 3167 // Send our response. 
3168 var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}} 3169 if err != nil { 3170 resp.Error = NewJSStreamCreateError(err, Unless(err)) 3171 s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 3172 } else { 3173 resp.StreamInfo = &StreamInfo{ 3174 Created: mset.createdTime(), 3175 State: mset.state(), 3176 Config: mset.config(), 3177 Cluster: js.clusterInfo(mset.raftGroup()), 3178 Sources: mset.sourcesInfo(), 3179 Mirror: mset.mirrorInfo(), 3180 TimeStamp: time.Now().UTC(), 3181 } 3182 resp.DidCreate = true 3183 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 3184 if node := mset.raftNode(); node != nil { 3185 mset.sendCreateAdvisory() 3186 } 3187 } 3188 } 3189 3190 // Fixed value ok for now. 3191 const lostQuorumAdvInterval = 10 * time.Second 3192 3193 // Determines if we should send lost quorum advisory. We throttle these after first one. 3194 func (mset *stream) shouldSendLostQuorum() bool { 3195 mset.mu.Lock() 3196 defer mset.mu.Unlock() 3197 if time.Since(mset.lqsent) >= lostQuorumAdvInterval { 3198 mset.lqsent = time.Now() 3199 return true 3200 } 3201 return false 3202 } 3203 3204 func (s *Server) sendStreamLostQuorumAdvisory(mset *stream) { 3205 if mset == nil { 3206 return 3207 } 3208 node, stream, acc := mset.raftNode(), mset.name(), mset.account() 3209 if node == nil { 3210 return 3211 } 3212 if !mset.shouldSendLostQuorum() { 3213 return 3214 } 3215 3216 s.Warnf("JetStream cluster stream '%s > %s' has NO quorum, stalled", acc.GetName(), stream) 3217 3218 subj := JSAdvisoryStreamQuorumLostPre + "." + stream 3219 adv := &JSStreamQuorumLostAdvisory{ 3220 TypedEvent: TypedEvent{ 3221 Type: JSStreamQuorumLostAdvisoryType, 3222 ID: nuid.Next(), 3223 Time: time.Now().UTC(), 3224 }, 3225 Stream: stream, 3226 Replicas: s.replicas(node), 3227 Domain: s.getOpts().JetStreamDomain, 3228 } 3229 3230 // Send to the user's account if not the system account. 3231 if acc != s.SystemAccount() { 3232 s.publishAdvisory(acc, subj, adv) 3233 } 3234 // Now do system level one. Place account info in adv, and nil account means system. 3235 adv.Account = acc.GetName() 3236 s.publishAdvisory(nil, subj, adv) 3237 } 3238 3239 func (s *Server) sendStreamLeaderElectAdvisory(mset *stream) { 3240 if mset == nil { 3241 return 3242 } 3243 node, stream, acc := mset.raftNode(), mset.name(), mset.account() 3244 if node == nil { 3245 return 3246 } 3247 subj := JSAdvisoryStreamLeaderElectedPre + "." + stream 3248 adv := &JSStreamLeaderElectedAdvisory{ 3249 TypedEvent: TypedEvent{ 3250 Type: JSStreamLeaderElectedAdvisoryType, 3251 ID: nuid.Next(), 3252 Time: time.Now().UTC(), 3253 }, 3254 Stream: stream, 3255 Leader: s.serverNameForNode(node.GroupLeader()), 3256 Replicas: s.replicas(node), 3257 Domain: s.getOpts().JetStreamDomain, 3258 } 3259 3260 // Send to the user's account if not the system account. 3261 if acc != s.SystemAccount() { 3262 s.publishAdvisory(acc, subj, adv) 3263 } 3264 // Now do system level one. Place account info in adv, and nil account means system. 3265 adv.Account = acc.GetName() 3266 s.publishAdvisory(nil, subj, adv) 3267 } 3268 3269 // Will lookup a stream assignment. 3270 // Lock should be held. 
3271 func (js *jetStream) streamAssignment(account, stream string) (sa *streamAssignment) { 3272 cc := js.cluster 3273 if cc == nil { 3274 return nil 3275 } 3276 3277 if as := cc.streams[account]; as != nil { 3278 sa = as[stream] 3279 } 3280 return sa 3281 } 3282 3283 // processStreamAssignment is called when followers have replicated an assignment. 3284 func (js *jetStream) processStreamAssignment(sa *streamAssignment) bool { 3285 js.mu.Lock() 3286 s, cc := js.srv, js.cluster 3287 accName, stream := sa.Client.serviceAccount(), sa.Config.Name 3288 noMeta := cc == nil || cc.meta == nil 3289 var ourID string 3290 if !noMeta { 3291 ourID = cc.meta.ID() 3292 } 3293 var isMember bool 3294 if sa.Group != nil && ourID != _EMPTY_ { 3295 isMember = sa.Group.isMember(ourID) 3296 } 3297 3298 // Remove this stream from the inflight proposals 3299 cc.removeInflightProposal(accName, sa.Config.Name) 3300 3301 if s == nil || noMeta { 3302 js.mu.Unlock() 3303 return false 3304 } 3305 3306 accStreams := cc.streams[accName] 3307 if accStreams == nil { 3308 accStreams = make(map[string]*streamAssignment) 3309 } else if osa := accStreams[stream]; osa != nil && osa != sa { 3310 // Copy over private existing state from former SA. 3311 if sa.Group != nil { 3312 sa.Group.node = osa.Group.node 3313 } 3314 sa.consumers = osa.consumers 3315 sa.responded = osa.responded 3316 sa.err = osa.err 3317 } 3318 3319 // Update our state. 3320 accStreams[stream] = sa 3321 cc.streams[accName] = accStreams 3322 hasResponded := sa.responded 3323 js.mu.Unlock() 3324 3325 acc, err := s.LookupAccount(accName) 3326 if err != nil { 3327 ll := fmt.Sprintf("Account [%s] lookup for stream create failed: %v", accName, err) 3328 if isMember { 3329 if !hasResponded { 3330 // If we can not lookup the account and we are a member, send this result back to the metacontroller leader. 3331 result := &streamAssignmentResult{ 3332 Account: accName, 3333 Stream: stream, 3334 Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}, 3335 } 3336 result.Response.Error = NewJSNoAccountError() 3337 s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result) 3338 } 3339 s.Warnf(ll) 3340 } else { 3341 s.Debugf(ll) 3342 } 3343 return false 3344 } 3345 3346 var didRemove bool 3347 3348 // Check if this is for us.. 3349 if isMember { 3350 js.processClusterCreateStream(acc, sa) 3351 } else if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { 3352 // We have one here even though we are not a member. This can happen on re-assignment. 3353 s.removeStream(ourID, mset, sa) 3354 } 3355 3356 // If this stream assignment does not have a sync subject (bug) set that the meta-leader should check when elected. 3357 if sa.Sync == _EMPTY_ { 3358 js.mu.Lock() 3359 cc.streamsCheck = true 3360 js.mu.Unlock() 3361 return false 3362 } 3363 3364 return didRemove 3365 } 3366 3367 // processUpdateStreamAssignment is called when followers have replicated an updated assignment. 
3368 func (js *jetStream) processUpdateStreamAssignment(sa *streamAssignment) { 3369 js.mu.RLock() 3370 s, cc := js.srv, js.cluster 3371 js.mu.RUnlock() 3372 if s == nil || cc == nil { 3373 // TODO(dlc) - debug at least 3374 return 3375 } 3376 3377 accName := sa.Client.serviceAccount() 3378 stream := sa.Config.Name 3379 3380 js.mu.Lock() 3381 if cc.meta == nil { 3382 js.mu.Unlock() 3383 return 3384 } 3385 ourID := cc.meta.ID() 3386 3387 var isMember bool 3388 if sa.Group != nil { 3389 isMember = sa.Group.isMember(ourID) 3390 } 3391 3392 accStreams := cc.streams[accName] 3393 if accStreams == nil { 3394 js.mu.Unlock() 3395 return 3396 } 3397 osa := accStreams[stream] 3398 if osa == nil { 3399 js.mu.Unlock() 3400 return 3401 } 3402 3403 // Copy over private existing state from former SA. 3404 if sa.Group != nil { 3405 sa.Group.node = osa.Group.node 3406 } 3407 sa.consumers = osa.consumers 3408 sa.err = osa.err 3409 3410 // If we detect we are scaling down to 1, non-clustered, and we had a previous node, clear it here. 3411 if sa.Config.Replicas == 1 && sa.Group.node != nil { 3412 sa.Group.node = nil 3413 } 3414 3415 // Update our state. 3416 accStreams[stream] = sa 3417 cc.streams[accName] = accStreams 3418 3419 // Make sure we respond if we are a member. 3420 if isMember { 3421 sa.responded = false 3422 } else { 3423 // Make sure to clean up any old node in case this stream moves back here. 3424 if sa.Group != nil { 3425 sa.Group.node = nil 3426 } 3427 } 3428 js.mu.Unlock() 3429 3430 acc, err := s.LookupAccount(accName) 3431 if err != nil { 3432 s.Warnf("Update Stream Account %s, error on lookup: %v", accName, err) 3433 return 3434 } 3435 3436 // Check if this is for us.. 3437 if isMember { 3438 js.processClusterUpdateStream(acc, osa, sa) 3439 } else if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { 3440 // We have one here even though we are not a member. This can happen on re-assignment. 3441 s.removeStream(ourID, mset, sa) 3442 } 3443 } 3444 3445 // Common function to remove ourself from this server. 3446 // This can happen on re-assignment, move, etc 3447 func (s *Server) removeStream(ourID string, mset *stream, nsa *streamAssignment) { 3448 if mset == nil { 3449 return 3450 } 3451 // Make sure to use the new stream assignment, not our own. 3452 s.Debugf("JetStream removing stream '%s > %s' from this server", nsa.Client.serviceAccount(), nsa.Config.Name) 3453 if node := mset.raftNode(); node != nil { 3454 if node.Leader() { 3455 node.StepDown(nsa.Group.Preferred) 3456 } 3457 node.ProposeRemovePeer(ourID) 3458 // shutdown monitor by shutting down raft. 3459 node.Delete() 3460 } 3461 3462 var isShuttingDown bool 3463 // Make sure this node is no longer attached to our stream assignment. 3464 if js, _ := s.getJetStreamCluster(); js != nil { 3465 js.mu.Lock() 3466 nsa.Group.node = nil 3467 isShuttingDown = js.shuttingDown 3468 js.mu.Unlock() 3469 } 3470 3471 if !isShuttingDown { 3472 // wait for monitor to be shutdown. 3473 mset.monitorWg.Wait() 3474 } 3475 mset.stop(true, false) 3476 } 3477 3478 // processClusterUpdateStream is called when we have a stream assignment that 3479 // has been updated for an existing assignment and we are a member. 
3480 func (js *jetStream) processClusterUpdateStream(acc *Account, osa, sa *streamAssignment) { 3481 if sa == nil { 3482 return 3483 } 3484 3485 js.mu.Lock() 3486 s, rg := js.srv, sa.Group 3487 client, subject, reply := sa.Client, sa.Subject, sa.Reply 3488 alreadyRunning, numReplicas := osa.Group.node != nil, len(rg.Peers) 3489 needsNode := rg.node == nil 3490 storage, cfg := sa.Config.Storage, sa.Config 3491 hasResponded := sa.responded 3492 sa.responded = true 3493 recovering := sa.recovering 3494 js.mu.Unlock() 3495 3496 mset, err := acc.lookupStream(cfg.Name) 3497 if err == nil && mset != nil { 3498 // Make sure we have not had a new group assigned to us. 3499 if osa.Group.Name != sa.Group.Name { 3500 s.Warnf("JetStream cluster detected stream remapping for '%s > %s' from %q to %q", 3501 acc, cfg.Name, osa.Group.Name, sa.Group.Name) 3502 mset.removeNode() 3503 alreadyRunning, needsNode = false, true 3504 // Make sure to clear from original. 3505 js.mu.Lock() 3506 osa.Group.node = nil 3507 js.mu.Unlock() 3508 } 3509 3510 var needsSetLeader bool 3511 if !alreadyRunning && numReplicas > 1 { 3512 if needsNode { 3513 mset.setLeader(false) 3514 js.createRaftGroup(acc.GetName(), rg, storage, pprofLabels{ 3515 "type": "stream", 3516 "account": mset.accName(), 3517 "stream": mset.name(), 3518 }) 3519 } 3520 mset.monitorWg.Add(1) 3521 // Start monitoring.. 3522 s.startGoRoutine( 3523 func() { js.monitorStream(mset, sa, needsNode) }, 3524 pprofLabels{ 3525 "type": "stream", 3526 "account": mset.accName(), 3527 "stream": mset.name(), 3528 }, 3529 ) 3530 } else if numReplicas == 1 && alreadyRunning { 3531 // We downgraded to R1. Make sure we cleanup the raft node and the stream monitor. 3532 mset.removeNode() 3533 // Make sure we are leader now that we are R1. 3534 needsSetLeader = true 3535 // In case we need to shutdown the cluster specific subs, etc. 3536 mset.setLeader(false) 3537 js.mu.Lock() 3538 rg.node = nil 3539 js.mu.Unlock() 3540 } 3541 // Call update. 3542 if err = mset.updateWithAdvisory(cfg, !recovering); err != nil { 3543 s.Warnf("JetStream cluster error updating stream %q for account %q: %v", cfg.Name, acc.Name, err) 3544 } 3545 // Set the new stream assignment. 3546 mset.setStreamAssignment(sa) 3547 // Make sure we are the leader now that we are R1. 3548 if needsSetLeader { 3549 mset.setLeader(true) 3550 } 3551 } 3552 3553 // If not found we must be expanding into this node since if we are here we know we are a member. 3554 if err == ErrJetStreamStreamNotFound { 3555 js.processStreamAssignment(sa) 3556 return 3557 } 3558 3559 if err != nil { 3560 js.mu.Lock() 3561 sa.err = err 3562 result := &streamAssignmentResult{ 3563 Account: sa.Client.serviceAccount(), 3564 Stream: sa.Config.Name, 3565 Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}, 3566 Update: true, 3567 } 3568 result.Response.Error = NewJSStreamGeneralError(err, Unless(err)) 3569 js.mu.Unlock() 3570 3571 // Send response to the metadata leader. They will forward to the user as needed. 3572 s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result) 3573 return 3574 } 3575 3576 isLeader := mset.IsLeader() 3577 3578 // Check for missing syncSubject bug. 3579 if isLeader && osa != nil && osa.Sync == _EMPTY_ { 3580 if node := mset.raftNode(); node != nil { 3581 node.StepDown() 3582 } 3583 return 3584 } 3585 3586 // If we were a single node being promoted assume leadership role for purpose of responding. 
3587 if !hasResponded && !isLeader && !alreadyRunning { 3588 isLeader = true 3589 } 3590 3591 // Check if we should bail. 3592 if !isLeader || hasResponded || recovering { 3593 return 3594 } 3595 3596 // Send our response. 3597 var resp = JSApiStreamUpdateResponse{ApiResponse: ApiResponse{Type: JSApiStreamUpdateResponseType}} 3598 resp.StreamInfo = &StreamInfo{ 3599 Created: mset.createdTime(), 3600 State: mset.state(), 3601 Config: mset.config(), 3602 Cluster: js.clusterInfo(mset.raftGroup()), 3603 Mirror: mset.mirrorInfo(), 3604 Sources: mset.sourcesInfo(), 3605 TimeStamp: time.Now().UTC(), 3606 } 3607 3608 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 3609 } 3610 3611 // processClusterCreateStream is called when we have a stream assignment that 3612 // has been committed and this server is a member of the peer group. 3613 func (js *jetStream) processClusterCreateStream(acc *Account, sa *streamAssignment) { 3614 if sa == nil { 3615 return 3616 } 3617 3618 js.mu.RLock() 3619 s, rg := js.srv, sa.Group 3620 alreadyRunning := rg.node != nil 3621 storage := sa.Config.Storage 3622 restore := sa.Restore 3623 js.mu.RUnlock() 3624 3625 // Process the raft group and make sure it's running if needed. 3626 err := js.createRaftGroup(acc.GetName(), rg, storage, pprofLabels{ 3627 "type": "stream", 3628 "account": acc.Name, 3629 "stream": sa.Config.Name, 3630 }) 3631 3632 // If we are restoring, create the stream if we are R>1 and not the preferred who handles the 3633 // receipt of the snapshot itself. 3634 shouldCreate := true 3635 if restore != nil { 3636 if len(rg.Peers) == 1 || rg.node != nil && rg.node.ID() == rg.Preferred { 3637 shouldCreate = false 3638 } else { 3639 js.mu.Lock() 3640 sa.Restore = nil 3641 js.mu.Unlock() 3642 } 3643 } 3644 3645 // Our stream. 3646 var mset *stream 3647 3648 // Process here if not restoring or not the leader. 3649 if shouldCreate && err == nil { 3650 // Go ahead and create or update the stream. 3651 mset, err = acc.lookupStream(sa.Config.Name) 3652 if err == nil && mset != nil { 3653 osa := mset.streamAssignment() 3654 // If we already have a stream assignment and they are the same exact config, short circuit here. 3655 if osa != nil { 3656 if reflect.DeepEqual(osa.Config, sa.Config) { 3657 if sa.Group.Name == osa.Group.Name && reflect.DeepEqual(sa.Group.Peers, osa.Group.Peers) { 3658 // Since this already exists we know it succeeded, just respond to this caller. 3659 js.mu.RLock() 3660 client, subject, reply, recovering := sa.Client, sa.Subject, sa.Reply, sa.recovering 3661 js.mu.RUnlock() 3662 3663 if !recovering { 3664 var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}} 3665 resp.StreamInfo = &StreamInfo{ 3666 Created: mset.createdTime(), 3667 State: mset.state(), 3668 Config: mset.config(), 3669 Cluster: js.clusterInfo(mset.raftGroup()), 3670 Sources: mset.sourcesInfo(), 3671 Mirror: mset.mirrorInfo(), 3672 TimeStamp: time.Now().UTC(), 3673 } 3674 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 3675 } 3676 return 3677 } else { 3678 // We had a bug where we could have multiple assignments for the same 3679 // stream but with different group assignments, including multiple raft 3680 // groups. So check for that here. We can only bet on the last one being 3681 // consistent in the long run, so let it continue if we see this condition. 
3682 s.Warnf("JetStream cluster detected duplicate assignment for stream %q for account %q", sa.Config.Name, acc.Name) 3683 if osa.Group.node != nil && osa.Group.node != sa.Group.node { 3684 osa.Group.node.Delete() 3685 osa.Group.node = nil 3686 } 3687 } 3688 } 3689 } 3690 mset.setStreamAssignment(sa) 3691 // Check if our config has really been updated. 3692 if !reflect.DeepEqual(mset.config(), sa.Config) { 3693 if err = mset.updateWithAdvisory(sa.Config, false); err != nil { 3694 s.Warnf("JetStream cluster error updating stream %q for account %q: %v", sa.Config.Name, acc.Name, err) 3695 if osa != nil { 3696 // Process the raft group and make sure it's running if needed. 3697 js.createRaftGroup(acc.GetName(), osa.Group, storage, pprofLabels{ 3698 "type": "stream", 3699 "account": mset.accName(), 3700 "stream": mset.name(), 3701 }) 3702 mset.setStreamAssignment(osa) 3703 } 3704 if rg.node != nil { 3705 rg.node.Delete() 3706 rg.node = nil 3707 } 3708 } 3709 } 3710 } else if err == NewJSStreamNotFoundError() { 3711 // Add in the stream here. 3712 mset, err = acc.addStreamWithAssignment(sa.Config, nil, sa) 3713 } 3714 if mset != nil { 3715 mset.setCreatedTime(sa.Created) 3716 } 3717 } 3718 3719 // This is an error condition. 3720 if err != nil { 3721 if IsNatsErr(err, JSStreamStoreFailedF) { 3722 s.Warnf("Stream create failed for '%s > %s': %v", sa.Client.serviceAccount(), sa.Config.Name, err) 3723 err = errStreamStoreFailed 3724 } 3725 js.mu.Lock() 3726 3727 sa.err = err 3728 hasResponded := sa.responded 3729 3730 // If out of space do nothing for now. 3731 if isOutOfSpaceErr(err) { 3732 hasResponded = true 3733 } 3734 3735 if rg.node != nil { 3736 rg.node.Delete() 3737 } 3738 3739 var result *streamAssignmentResult 3740 if !hasResponded { 3741 result = &streamAssignmentResult{ 3742 Account: sa.Client.serviceAccount(), 3743 Stream: sa.Config.Name, 3744 Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}, 3745 } 3746 result.Response.Error = NewJSStreamCreateError(err, Unless(err)) 3747 } 3748 js.mu.Unlock() 3749 3750 // Send response to the metadata leader. They will forward to the user as needed. 3751 if result != nil { 3752 s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result) 3753 } 3754 return 3755 } 3756 3757 // Re-capture node. 3758 js.mu.RLock() 3759 node := rg.node 3760 js.mu.RUnlock() 3761 3762 // Start our monitoring routine. 3763 if node != nil { 3764 if !alreadyRunning { 3765 if mset != nil { 3766 mset.monitorWg.Add(1) 3767 } 3768 s.startGoRoutine( 3769 func() { js.monitorStream(mset, sa, false) }, 3770 pprofLabels{ 3771 "type": "stream", 3772 "account": mset.accName(), 3773 "stream": mset.name(), 3774 }, 3775 ) 3776 } 3777 } else { 3778 // Single replica stream, process manually here. 3779 // If we are restoring, process that first. 3780 if sa.Restore != nil { 3781 // We are restoring a stream here. 
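		// processStreamRestore returns a channel that is signaled once the snapshot has
		// been received and applied. The goroutine below waits on it, stamps the stream
		// with its assignment and created time on success, reports failures back to the
		// metadata leader, and re-proposes assignments for any restored consumers.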
3782 restoreDoneCh := s.processStreamRestore(sa.Client, acc, sa.Config, _EMPTY_, sa.Reply, _EMPTY_) 3783 s.startGoRoutine(func() { 3784 defer s.grWG.Done() 3785 select { 3786 case err := <-restoreDoneCh: 3787 if err == nil { 3788 mset, err = acc.lookupStream(sa.Config.Name) 3789 if mset != nil { 3790 mset.setStreamAssignment(sa) 3791 mset.setCreatedTime(sa.Created) 3792 } 3793 } 3794 if err != nil { 3795 if mset != nil { 3796 mset.delete() 3797 } 3798 js.mu.Lock() 3799 sa.err = err 3800 result := &streamAssignmentResult{ 3801 Account: sa.Client.serviceAccount(), 3802 Stream: sa.Config.Name, 3803 Restore: &JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}}, 3804 } 3805 result.Restore.Error = NewJSStreamRestoreError(err, Unless(err)) 3806 js.mu.Unlock() 3807 // Send response to the metadata leader. They will forward to the user as needed. 3808 b, _ := json.Marshal(result) // Avoids auto-processing and doing fancy json with newlines. 3809 s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, b) 3810 return 3811 } 3812 js.processStreamLeaderChange(mset, true) 3813 3814 // Check to see if we have restored consumers here. 3815 // These are not currently assigned so we will need to do so here. 3816 if consumers := mset.getPublicConsumers(); len(consumers) > 0 { 3817 js.mu.RLock() 3818 cc := js.cluster 3819 js.mu.RUnlock() 3820 3821 for _, o := range consumers { 3822 name, cfg := o.String(), o.config() 3823 rg := cc.createGroupForConsumer(&cfg, sa) 3824 3825 // Place our initial state here as well for assignment distribution. 3826 ca := &consumerAssignment{ 3827 Group: rg, 3828 Stream: sa.Config.Name, 3829 Name: name, 3830 Config: &cfg, 3831 Client: sa.Client, 3832 Created: o.createdTime(), 3833 } 3834 3835 addEntry := encodeAddConsumerAssignment(ca) 3836 cc.meta.ForwardProposal(addEntry) 3837 3838 // Check to make sure we see the assignment. 3839 go func() { 3840 ticker := time.NewTicker(time.Second) 3841 defer ticker.Stop() 3842 for range ticker.C { 3843 js.mu.RLock() 3844 ca, meta := js.consumerAssignment(ca.Client.serviceAccount(), sa.Config.Name, name), cc.meta 3845 js.mu.RUnlock() 3846 if ca == nil { 3847 s.Warnf("Consumer assignment has not been assigned, retrying") 3848 if meta != nil { 3849 meta.ForwardProposal(addEntry) 3850 } else { 3851 return 3852 } 3853 } else { 3854 return 3855 } 3856 } 3857 }() 3858 } 3859 } 3860 case <-s.quitCh: 3861 return 3862 } 3863 }) 3864 } else { 3865 js.processStreamLeaderChange(mset, true) 3866 } 3867 } 3868 } 3869 3870 // processStreamRemoval is called when followers have replicated an assignment. 3871 func (js *jetStream) processStreamRemoval(sa *streamAssignment) { 3872 js.mu.Lock() 3873 s, cc := js.srv, js.cluster 3874 if s == nil || cc == nil || cc.meta == nil { 3875 // TODO(dlc) - debug at least 3876 js.mu.Unlock() 3877 return 3878 } 3879 stream := sa.Config.Name 3880 isMember := sa.Group.isMember(cc.meta.ID()) 3881 wasLeader := cc.isStreamLeader(sa.Client.serviceAccount(), stream) 3882 3883 // Check if we already have this assigned. 
3884 accStreams := cc.streams[sa.Client.serviceAccount()] 3885 needDelete := accStreams != nil && accStreams[stream] != nil 3886 if needDelete { 3887 delete(accStreams, stream) 3888 if len(accStreams) == 0 { 3889 delete(cc.streams, sa.Client.serviceAccount()) 3890 } 3891 } 3892 js.mu.Unlock() 3893 3894 if needDelete { 3895 js.processClusterDeleteStream(sa, isMember, wasLeader) 3896 } 3897 } 3898 3899 func (js *jetStream) processClusterDeleteStream(sa *streamAssignment, isMember, wasLeader bool) { 3900 if sa == nil { 3901 return 3902 } 3903 js.mu.RLock() 3904 s := js.srv 3905 node := sa.Group.node 3906 hadLeader := node == nil || node.GroupLeader() != noLeader 3907 offline := s.allPeersOffline(sa.Group) 3908 var isMetaLeader bool 3909 if cc := js.cluster; cc != nil { 3910 isMetaLeader = cc.isLeader() 3911 } 3912 recovering := sa.recovering 3913 js.mu.RUnlock() 3914 3915 stopped := false 3916 var resp = JSApiStreamDeleteResponse{ApiResponse: ApiResponse{Type: JSApiStreamDeleteResponseType}} 3917 var err error 3918 var acc *Account 3919 3920 // Go ahead and delete the stream if we have it and the account here. 3921 if acc, _ = s.LookupAccount(sa.Client.serviceAccount()); acc != nil { 3922 if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { 3923 // shut down monitor by shutting down raft 3924 if n := mset.raftNode(); n != nil { 3925 n.Delete() 3926 } 3927 // wait for monitor to be shut down 3928 mset.monitorWg.Wait() 3929 err = mset.stop(true, wasLeader) 3930 stopped = true 3931 } else if isMember { 3932 s.Warnf("JetStream failed to lookup running stream while removing stream '%s > %s' from this server", 3933 sa.Client.serviceAccount(), sa.Config.Name) 3934 } 3935 } else if isMember { 3936 s.Warnf("JetStream failed to lookup account while removing stream '%s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name) 3937 } 3938 3939 // Always delete the node if present. 3940 if node != nil { 3941 node.Delete() 3942 } 3943 3944 // This is a stop gap cleanup in case 3945 // 1) the account does not exist (and mset couldn't be stopped) and/or 3946 // 2) node was nil (and couldn't be deleted) 3947 if !stopped || node == nil { 3948 if sacc := s.SystemAccount(); sacc != nil { 3949 saccName := sacc.GetName() 3950 os.RemoveAll(filepath.Join(js.config.StoreDir, saccName, defaultStoreDirName, sa.Group.Name)) 3951 // cleanup dependent consumer groups 3952 if !stopped { 3953 for _, ca := range sa.consumers { 3954 // Make sure we cleanup any possible running nodes for the consumers. 3955 if isMember && ca.Group != nil && ca.Group.node != nil { 3956 ca.Group.node.Delete() 3957 } 3958 os.RemoveAll(filepath.Join(js.config.StoreDir, saccName, defaultStoreDirName, ca.Group.Name)) 3959 } 3960 } 3961 } 3962 } 3963 accDir := filepath.Join(js.config.StoreDir, sa.Client.serviceAccount()) 3964 streamDir := filepath.Join(accDir, streamsDir) 3965 os.RemoveAll(filepath.Join(streamDir, sa.Config.Name)) 3966 3967 // no op if not empty 3968 os.Remove(streamDir) 3969 os.Remove(accDir) 3970 3971 // Normally we want only the leader to respond here, but if we had no leader then all members will respond to make 3972 // sure we get feedback to the user. 3973 if !isMember || (hadLeader && !wasLeader) { 3974 // If all the peers are offline and we are the meta leader we will also respond, so suppress returning here. 
3975 if !(offline && isMetaLeader) { 3976 return 3977 } 3978 } 3979 3980 // Do not respond if the account does not exist any longer 3981 if acc == nil || recovering { 3982 return 3983 } 3984 3985 if err != nil { 3986 resp.Error = NewJSStreamGeneralError(err, Unless(err)) 3987 s.sendAPIErrResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, s.jsonResponse(resp)) 3988 } else { 3989 resp.Success = true 3990 s.sendAPIResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, s.jsonResponse(resp)) 3991 } 3992 } 3993 3994 // processConsumerAssignment is called when followers have replicated an assignment for a consumer. 3995 func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) { 3996 js.mu.RLock() 3997 s, cc := js.srv, js.cluster 3998 accName, stream, consumerName := ca.Client.serviceAccount(), ca.Stream, ca.Name 3999 noMeta := cc == nil || cc.meta == nil 4000 shuttingDown := js.shuttingDown 4001 var ourID string 4002 if !noMeta { 4003 ourID = cc.meta.ID() 4004 } 4005 var isMember bool 4006 if ca.Group != nil && ourID != _EMPTY_ { 4007 isMember = ca.Group.isMember(ourID) 4008 } 4009 js.mu.RUnlock() 4010 4011 if s == nil || noMeta || shuttingDown { 4012 return 4013 } 4014 4015 sa := js.streamAssignment(accName, stream) 4016 if sa == nil { 4017 s.Debugf("Consumer create failed, could not locate stream '%s > %s'", accName, stream) 4018 return 4019 } 4020 4021 // Might need this below. 4022 numReplicas := sa.Config.Replicas 4023 4024 // Track if this existed already. 4025 var wasExisting bool 4026 4027 // Check if we have an existing consumer assignment. 4028 js.mu.Lock() 4029 if sa.consumers == nil { 4030 sa.consumers = make(map[string]*consumerAssignment) 4031 } else if oca := sa.consumers[ca.Name]; oca != nil { 4032 wasExisting = true 4033 // Copy over private existing state from former SA. 4034 if ca.Group != nil { 4035 ca.Group.node = oca.Group.node 4036 } 4037 ca.responded = oca.responded 4038 ca.err = oca.err 4039 } 4040 4041 // Capture the optional state. We will pass it along if we are a member to apply. 4042 // This is only applicable when restoring a stream with consumers. 4043 state := ca.State 4044 ca.State = nil 4045 4046 // Place into our internal map under the stream assignment. 4047 // Ok to replace an existing one, we check on process call below. 4048 sa.consumers[ca.Name] = ca 4049 js.mu.Unlock() 4050 4051 acc, err := s.LookupAccount(accName) 4052 if err != nil { 4053 ll := fmt.Sprintf("Account [%s] lookup for consumer create failed: %v", accName, err) 4054 if isMember { 4055 if !js.isMetaRecovering() { 4056 // If we can not lookup the account and we are a member, send this result back to the metacontroller leader. 4057 result := &consumerAssignmentResult{ 4058 Account: accName, 4059 Stream: stream, 4060 Consumer: consumerName, 4061 Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}, 4062 } 4063 result.Response.Error = NewJSNoAccountError() 4064 s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result) 4065 } 4066 s.Warnf(ll) 4067 } else { 4068 s.Debugf(ll) 4069 } 4070 return 4071 } 4072 4073 // Check if this is for us.. 4074 if isMember { 4075 js.processClusterCreateConsumer(ca, state, wasExisting) 4076 } else { 4077 // We need to be removed here, we are no longer assigned. 4078 // Grab consumer if we have it. 
4079 var o *consumer 4080 if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { 4081 o = mset.lookupConsumer(ca.Name) 4082 } 4083 4084 // Check if we have a raft node running, meaning we are no longer part of the group but were. 4085 js.mu.Lock() 4086 if node := ca.Group.node; node != nil { 4087 // We have one here even though we are not a member. This can happen on re-assignment. 4088 s.Debugf("JetStream removing consumer '%s > %s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name, ca.Name) 4089 if node.Leader() { 4090 s.Debugf("JetStream consumer '%s > %s > %s' is being removed and was the leader, will perform stepdown", 4091 sa.Client.serviceAccount(), sa.Config.Name, ca.Name) 4092 4093 peers, cn := node.Peers(), s.cachedClusterName() 4094 migrating := numReplicas != len(peers) 4095 4096 // Select a new peer to transfer to. If we are a migrating make sure its from the new cluster. 4097 var npeer string 4098 for _, r := range peers { 4099 if !r.Current { 4100 continue 4101 } 4102 if !migrating { 4103 npeer = r.ID 4104 break 4105 } else if sir, ok := s.nodeToInfo.Load(r.ID); ok && sir != nil { 4106 si := sir.(nodeInfo) 4107 if si.cluster != cn { 4108 npeer = r.ID 4109 break 4110 } 4111 } 4112 } 4113 // Clear the raftnode from our consumer so that a subsequent o.delete will not also issue a stepdown. 4114 if o != nil { 4115 o.clearRaftNode() 4116 } 4117 // Manually handle the stepdown and deletion of the node. 4118 node.UpdateKnownPeers(ca.Group.Peers) 4119 node.StepDown(npeer) 4120 node.Delete() 4121 } else { 4122 node.UpdateKnownPeers(ca.Group.Peers) 4123 } 4124 } 4125 // Always clear the old node. 4126 ca.Group.node = nil 4127 ca.err = nil 4128 js.mu.Unlock() 4129 4130 if o != nil { 4131 o.deleteWithoutAdvisory() 4132 } 4133 } 4134 } 4135 4136 func (js *jetStream) processConsumerRemoval(ca *consumerAssignment) { 4137 js.mu.Lock() 4138 s, cc := js.srv, js.cluster 4139 if s == nil || cc == nil || cc.meta == nil { 4140 // TODO(dlc) - debug at least 4141 js.mu.Unlock() 4142 return 4143 } 4144 isMember := ca.Group.isMember(cc.meta.ID()) 4145 wasLeader := cc.isConsumerLeader(ca.Client.serviceAccount(), ca.Stream, ca.Name) 4146 4147 // Delete from our state. 4148 var needDelete bool 4149 if accStreams := cc.streams[ca.Client.serviceAccount()]; accStreams != nil { 4150 if sa := accStreams[ca.Stream]; sa != nil && sa.consumers != nil && sa.consumers[ca.Name] != nil { 4151 oca := sa.consumers[ca.Name] 4152 // Make sure this removal is for what we have, otherwise ignore. 4153 if ca.Group != nil && oca.Group != nil && ca.Group.Name == oca.Group.Name { 4154 needDelete = true 4155 oca.deleted = true 4156 delete(sa.consumers, ca.Name) 4157 } 4158 } 4159 } 4160 js.mu.Unlock() 4161 4162 if needDelete { 4163 js.processClusterDeleteConsumer(ca, isMember, wasLeader) 4164 } 4165 } 4166 4167 type consumerAssignmentResult struct { 4168 Account string `json:"account"` 4169 Stream string `json:"stream"` 4170 Consumer string `json:"consumer"` 4171 Response *JSApiConsumerCreateResponse `json:"response,omitempty"` 4172 } 4173 4174 // processClusterCreateConsumer is when we are a member of the group and need to create the consumer. 
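// state, when non-nil, carries a restored ConsumerState that is applied to the consumer store,
// and wasExisting indicates an assignment for this consumer name was already present.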
4175 func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state *ConsumerState, wasExisting bool) {
4176 if ca == nil {
4177 return
4178 }
4179 js.mu.RLock()
4180 s := js.srv
4181 rg := ca.Group
4182 alreadyRunning := rg != nil && rg.node != nil
4183 accName, stream, consumer := ca.Client.serviceAccount(), ca.Stream, ca.Name
4184 js.mu.RUnlock()
4185 
4186 acc, err := s.LookupAccount(accName)
4187 if err != nil {
4188 s.Warnf("JetStream cluster failed to lookup account %q: %v", accName, err)
4189 return
4190 }
4191 
4192 // Go ahead and create or update the consumer.
4193 mset, err := acc.lookupStream(stream)
4194 if err != nil {
4195 if !js.isMetaRecovering() {
4196 js.mu.Lock()
4197 s.Warnf("Consumer create failed, could not locate stream '%s > %s > %s'", ca.Client.serviceAccount(), ca.Stream, ca.Name)
4198 ca.err = NewJSStreamNotFoundError()
4199 result := &consumerAssignmentResult{
4200 Account: ca.Client.serviceAccount(),
4201 Stream: ca.Stream,
4202 Consumer: ca.Name,
4203 Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}},
4204 }
4205 result.Response.Error = NewJSStreamNotFoundError()
4206 s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result)
4207 js.mu.Unlock()
4208 }
4209 return
4210 }
4211 
4212 // Check if we already have this consumer running.
4213 o := mset.lookupConsumer(consumer)
4214 
4215 if !alreadyRunning {
4216 // Process the raft group and make sure it's running if needed.
4217 storage := mset.config().Storage
4218 if ca.Config.MemoryStorage {
4219 storage = MemoryStorage
4220 }
4221 // No-op if R1.
4222 js.createRaftGroup(accName, rg, storage, pprofLabels{
4223 "type": "consumer",
4224 "account": mset.accName(),
4225 "stream": ca.Stream,
4226 "consumer": ca.Name,
4227 })
4228 } else {
4229 // If we are clustered update the known peers.
4230 js.mu.RLock()
4231 if node := rg.node; node != nil {
4232 node.UpdateKnownPeers(ca.Group.Peers)
4233 }
4234 js.mu.RUnlock()
4235 }
4236 
4237 // Check if we already have this consumer running.
4238 var didCreate, isConfigUpdate, needsLocalResponse bool
4239 if o == nil {
4240 // Add in the consumer if needed.
4241 if o, err = mset.addConsumerWithAssignment(ca.Config, ca.Name, ca, wasExisting, ActionCreateOrUpdate); err == nil {
4242 didCreate = true
4243 }
4244 } else {
4245 // This consumer exists.
4246 // Only update if config is really different.
4247 cfg := o.config()
4248 if isConfigUpdate = !reflect.DeepEqual(&cfg, ca.Config); isConfigUpdate {
4249 // Call into update, ignore consumer exists error here since this means an old deliver subject is bound
4250 // which can happen on restart etc.
4251 if err := o.updateConfig(ca.Config); err != nil && err != NewJSConsumerNameExistError() {
4252 // This is essentially an update that has failed. Respond back to metaleader if we are not recovering.
4253 js.mu.RLock() 4254 if !js.metaRecovering { 4255 result := &consumerAssignmentResult{ 4256 Account: accName, 4257 Stream: stream, 4258 Consumer: consumer, 4259 Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}, 4260 } 4261 result.Response.Error = NewJSConsumerNameExistError() 4262 s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result) 4263 } 4264 s.Warnf("Consumer create failed during update for '%s > %s > %s': %v", ca.Client.serviceAccount(), ca.Stream, ca.Name, err) 4265 js.mu.RUnlock() 4266 return 4267 } 4268 } 4269 4270 var sendState bool 4271 js.mu.RLock() 4272 n := rg.node 4273 // Check if we already had a consumer assignment and its still pending. 4274 cca, oca := ca, o.consumerAssignment() 4275 if oca != nil { 4276 if !oca.responded { 4277 // We can't override info for replying here otherwise leader once elected can not respond. 4278 // So copy over original client and the reply from the old ca. 4279 cac := *ca 4280 cac.Client = oca.Client 4281 cac.Reply = oca.Reply 4282 cca = &cac 4283 needsLocalResponse = true 4284 } 4285 // If we look like we are scaling up, let's send our current state to the group. 4286 sendState = len(ca.Group.Peers) > len(oca.Group.Peers) && o.IsLeader() && n != nil 4287 // Signal that this is an update 4288 if ca.Reply != _EMPTY_ { 4289 isConfigUpdate = true 4290 } 4291 } 4292 js.mu.RUnlock() 4293 4294 if sendState { 4295 if snap, err := o.store.EncodedState(); err == nil { 4296 n.SendSnapshot(snap) 4297 } 4298 } 4299 4300 // Set CA for our consumer. 4301 o.setConsumerAssignment(cca) 4302 s.Debugf("JetStream cluster, consumer '%s > %s > %s' was already running", ca.Client.serviceAccount(), ca.Stream, ca.Name) 4303 } 4304 4305 // If we have an initial state set apply that now. 4306 if state != nil && o != nil { 4307 o.mu.Lock() 4308 err = o.setStoreState(state) 4309 o.mu.Unlock() 4310 } 4311 4312 if err != nil { 4313 if IsNatsErr(err, JSConsumerStoreFailedErrF) { 4314 s.Warnf("Consumer create failed for '%s > %s > %s': %v", ca.Client.serviceAccount(), ca.Stream, ca.Name, err) 4315 err = errConsumerStoreFailed 4316 } 4317 4318 js.mu.Lock() 4319 4320 ca.err = err 4321 hasResponded := ca.responded 4322 4323 // If out of space do nothing for now. 4324 if isOutOfSpaceErr(err) { 4325 hasResponded = true 4326 } 4327 4328 if rg.node != nil { 4329 rg.node.Delete() 4330 // Clear the node here. 4331 rg.node = nil 4332 } 4333 4334 // If we did seem to create a consumer make sure to stop it. 4335 if o != nil { 4336 o.stop() 4337 } 4338 4339 var result *consumerAssignmentResult 4340 if !hasResponded && !js.metaRecovering { 4341 result = &consumerAssignmentResult{ 4342 Account: ca.Client.serviceAccount(), 4343 Stream: ca.Stream, 4344 Consumer: ca.Name, 4345 Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}, 4346 } 4347 result.Response.Error = NewJSConsumerCreateError(err, Unless(err)) 4348 } else if err == errNoInterest { 4349 // This is a stranded ephemeral, let's clean this one up. 4350 subject := fmt.Sprintf(JSApiConsumerDeleteT, ca.Stream, ca.Name) 4351 mset.outq.send(newJSPubMsg(subject, _EMPTY_, _EMPTY_, nil, nil, nil, 0)) 4352 } 4353 js.mu.Unlock() 4354 4355 if result != nil { 4356 // Send response to the metadata leader. They will forward to the user as needed. 4357 b, _ := json.Marshal(result) // Avoids auto-processing and doing fancy json with newlines. 
4358 s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, b) 4359 } 4360 } else { 4361 if didCreate { 4362 o.setCreatedTime(ca.Created) 4363 } else { 4364 // Check for scale down to 1.. 4365 if rg.node != nil && len(rg.Peers) == 1 { 4366 o.clearNode() 4367 o.setLeader(true) 4368 // Need to clear from rg too. 4369 js.mu.Lock() 4370 rg.node = nil 4371 client, subject, reply := ca.Client, ca.Subject, ca.Reply 4372 js.mu.Unlock() 4373 var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} 4374 resp.ConsumerInfo = o.info() 4375 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 4376 return 4377 } 4378 } 4379 4380 if rg.node == nil { 4381 // Single replica consumer, process manually here. 4382 js.mu.Lock() 4383 // Force response in case we think this is an update. 4384 if !js.metaRecovering && isConfigUpdate { 4385 ca.responded = false 4386 } 4387 js.mu.Unlock() 4388 js.processConsumerLeaderChange(o, true) 4389 } else { 4390 // Clustered consumer. 4391 // Start our monitoring routine if needed. 4392 if !alreadyRunning && o.shouldStartMonitor() { 4393 s.startGoRoutine( 4394 func() { js.monitorConsumer(o, ca) }, 4395 pprofLabels{ 4396 "type": "consumer", 4397 "account": mset.accName(), 4398 "stream": mset.name(), 4399 "consumer": ca.Name, 4400 }, 4401 ) 4402 } 4403 // For existing consumer, only send response if not recovering. 4404 if wasExisting && !js.isMetaRecovering() { 4405 if o.IsLeader() || (!didCreate && needsLocalResponse) { 4406 // Process if existing as an update. Double check that this is not recovered. 4407 js.mu.RLock() 4408 client, subject, reply, recovering := ca.Client, ca.Subject, ca.Reply, ca.recovering 4409 js.mu.RUnlock() 4410 if !recovering { 4411 var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} 4412 resp.ConsumerInfo = o.info() 4413 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 4414 } 4415 } 4416 } 4417 } 4418 } 4419 } 4420 4421 func (js *jetStream) processClusterDeleteConsumer(ca *consumerAssignment, isMember, wasLeader bool) { 4422 if ca == nil { 4423 return 4424 } 4425 js.mu.RLock() 4426 s := js.srv 4427 node := ca.Group.node 4428 offline := s.allPeersOffline(ca.Group) 4429 var isMetaLeader bool 4430 if cc := js.cluster; cc != nil { 4431 isMetaLeader = cc.isLeader() 4432 } 4433 recovering := ca.recovering 4434 js.mu.RUnlock() 4435 4436 var resp = JSApiConsumerDeleteResponse{ApiResponse: ApiResponse{Type: JSApiConsumerDeleteResponseType}} 4437 var err error 4438 var acc *Account 4439 4440 // Go ahead and delete the consumer if we have it and the account. 4441 if acc, _ = s.LookupAccount(ca.Client.serviceAccount()); acc != nil { 4442 if mset, _ := acc.lookupStream(ca.Stream); mset != nil { 4443 if o := mset.lookupConsumer(ca.Name); o != nil { 4444 err = o.stopWithFlags(true, false, true, wasLeader) 4445 } 4446 } 4447 } else if ca.Group != nil { 4448 // We have a missing account, see if we can cleanup. 4449 if sacc := s.SystemAccount(); sacc != nil { 4450 os.RemoveAll(filepath.Join(js.config.StoreDir, sacc.GetName(), defaultStoreDirName, ca.Group.Name)) 4451 } 4452 } 4453 4454 // Always delete the node if present. 4455 if node != nil { 4456 node.Delete() 4457 } 4458 4459 if !wasLeader || ca.Reply == _EMPTY_ { 4460 if !(offline && isMetaLeader) { 4461 return 4462 } 4463 } 4464 4465 // Do not respond if the account does not exist any longer or this is during recovery. 
4466 if acc == nil || recovering { 4467 return 4468 } 4469 4470 if err != nil { 4471 resp.Error = NewJSStreamNotFoundError(Unless(err)) 4472 s.sendAPIErrResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(resp)) 4473 } else { 4474 resp.Success = true 4475 s.sendAPIResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(resp)) 4476 } 4477 } 4478 4479 // Returns the consumer assignment, or nil if not present. 4480 // Lock should be held. 4481 func (js *jetStream) consumerAssignment(account, stream, consumer string) *consumerAssignment { 4482 if sa := js.streamAssignment(account, stream); sa != nil { 4483 return sa.consumers[consumer] 4484 } 4485 return nil 4486 } 4487 4488 // consumerAssigned informs us if this server has this consumer assigned. 4489 func (jsa *jsAccount) consumerAssigned(stream, consumer string) bool { 4490 jsa.mu.RLock() 4491 js, acc := jsa.js, jsa.account 4492 jsa.mu.RUnlock() 4493 4494 if js == nil { 4495 return false 4496 } 4497 js.mu.RLock() 4498 defer js.mu.RUnlock() 4499 return js.cluster.isConsumerAssigned(acc, stream, consumer) 4500 } 4501 4502 // Read lock should be held. 4503 func (cc *jetStreamCluster) isConsumerAssigned(a *Account, stream, consumer string) bool { 4504 // Non-clustered mode always return true. 4505 if cc == nil { 4506 return true 4507 } 4508 if cc.meta == nil { 4509 return false 4510 } 4511 var sa *streamAssignment 4512 accStreams := cc.streams[a.Name] 4513 if accStreams != nil { 4514 sa = accStreams[stream] 4515 } 4516 if sa == nil { 4517 // TODO(dlc) - This should not happen. 4518 return false 4519 } 4520 ca := sa.consumers[consumer] 4521 if ca == nil { 4522 return false 4523 } 4524 rg := ca.Group 4525 // Check if we are the leader of this raftGroup assigned to the stream. 4526 ourID := cc.meta.ID() 4527 for _, peer := range rg.Peers { 4528 if peer == ourID { 4529 return true 4530 } 4531 } 4532 return false 4533 } 4534 4535 // Returns our stream and underlying raft node. 4536 func (o *consumer) streamAndNode() (*stream, RaftNode) { 4537 if o == nil { 4538 return nil, nil 4539 } 4540 o.mu.RLock() 4541 defer o.mu.RUnlock() 4542 return o.mset, o.node 4543 } 4544 4545 // Return the replica count for this consumer. If the consumer has been 4546 // stopped, this will return an error. 4547 func (o *consumer) replica() (int, error) { 4548 o.mu.RLock() 4549 oCfg := o.cfg 4550 mset := o.mset 4551 o.mu.RUnlock() 4552 if mset == nil { 4553 return 0, errBadConsumer 4554 } 4555 sCfg := mset.config() 4556 return oCfg.replicas(&sCfg), nil 4557 } 4558 4559 func (o *consumer) raftGroup() *raftGroup { 4560 if o == nil { 4561 return nil 4562 } 4563 o.mu.RLock() 4564 defer o.mu.RUnlock() 4565 if o.ca == nil { 4566 return nil 4567 } 4568 return o.ca.Group 4569 } 4570 4571 func (o *consumer) clearRaftNode() { 4572 if o == nil { 4573 return 4574 } 4575 o.mu.Lock() 4576 defer o.mu.Unlock() 4577 o.node = nil 4578 } 4579 4580 func (o *consumer) raftNode() RaftNode { 4581 if o == nil { 4582 return nil 4583 } 4584 o.mu.RLock() 4585 defer o.mu.RUnlock() 4586 return o.node 4587 } 4588 4589 func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { 4590 s, n, cc := js.server(), o.raftNode(), js.cluster 4591 defer s.grWG.Done() 4592 4593 defer o.clearMonitorRunning() 4594 4595 if n == nil { 4596 s.Warnf("No RAFT group for '%s > %s > %s'", o.acc.Name, ca.Stream, ca.Name) 4597 return 4598 } 4599 4600 // Make sure to stop the raft group on exit to prevent accidental memory bloat. 
4601 // This should be below the checkInMonitor call though to avoid stopping it out 4602 // from underneath the one that is running since it will be the same raft node. 4603 defer n.Stop() 4604 4605 qch, lch, aq, uch, ourPeerId := n.QuitC(), n.LeadChangeC(), n.ApplyQ(), o.updateC(), cc.meta.ID() 4606 4607 s.Debugf("Starting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group()) 4608 defer s.Debugf("Exiting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group()) 4609 4610 const ( 4611 compactInterval = 2 * time.Minute 4612 compactSizeMin = 64 * 1024 // What is stored here is always small for consumers. 4613 compactNumMin = 1024 4614 minSnapDelta = 10 * time.Second 4615 ) 4616 4617 // Spread these out for large numbers on server restart. 4618 rci := time.Duration(rand.Int63n(int64(time.Minute))) 4619 t := time.NewTicker(compactInterval + rci) 4620 defer t.Stop() 4621 4622 // Highwayhash key for generating hashes. 4623 key := make([]byte, 32) 4624 crand.Read(key) 4625 4626 // Hash of the last snapshot (fixed size in memory). 4627 var lastSnap []byte 4628 var lastSnapTime time.Time 4629 4630 // Don't allow the upper layer to install snapshots until we have 4631 // fully recovered from disk. 4632 recovering := true 4633 4634 doSnapshot := func(force bool) { 4635 // Bail if trying too fast and not in a forced situation. 4636 if recovering || (!force && time.Since(lastSnapTime) < minSnapDelta) { 4637 return 4638 } 4639 4640 // Check several things to see if we need a snapshot. 4641 ne, nb := n.Size() 4642 if !n.NeedSnapshot() { 4643 // Check if we should compact etc. based on size of log. 4644 if !force && ne < compactNumMin && nb < compactSizeMin { 4645 return 4646 } 4647 } 4648 4649 if snap, err := o.store.EncodedState(); err == nil { 4650 hash := highwayhash.Sum(snap, key) 4651 // If the state hasn't changed but the log has gone way over 4652 // the compaction size then we will want to compact anyway. 4653 // This can happen for example when a pull consumer fetches a 4654 // lot on an idle stream, log entries get distributed but the 4655 // state never changes, therefore the log never gets compacted. 4656 if !bytes.Equal(hash[:], lastSnap) || ne >= compactNumMin || nb >= compactSizeMin { 4657 if err := n.InstallSnapshot(snap); err == nil { 4658 lastSnap, lastSnapTime = hash[:], time.Now() 4659 } else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning { 4660 s.RateLimitWarnf("Failed to install snapshot for '%s > %s > %s' [%s]: %v", o.acc.Name, ca.Stream, ca.Name, n.Group(), err) 4661 } 4662 } 4663 } 4664 } 4665 4666 // For migration tracking. 4667 var mmt *time.Ticker 4668 var mmtc <-chan time.Time 4669 4670 startMigrationMonitoring := func() { 4671 if mmt == nil { 4672 mmt = time.NewTicker(500 * time.Millisecond) 4673 mmtc = mmt.C 4674 } 4675 } 4676 4677 stopMigrationMonitoring := func() { 4678 if mmt != nil { 4679 mmt.Stop() 4680 mmt, mmtc = nil, nil 4681 } 4682 } 4683 defer stopMigrationMonitoring() 4684 4685 // Track if we are leader. 4686 var isLeader bool 4687 4688 for { 4689 select { 4690 case <-s.quitCh: 4691 return 4692 case <-qch: 4693 return 4694 case <-aq.ch: 4695 ces := aq.pop() 4696 for _, ce := range ces { 4697 // No special processing needed for when we are caught up on restart. 4698 if ce == nil { 4699 recovering = false 4700 if n.NeedSnapshot() { 4701 doSnapshot(true) 4702 } 4703 // Check our state if we are under an interest based stream. 
4704 o.checkStateForInterestStream() 4705 } else if err := js.applyConsumerEntries(o, ce, isLeader); err == nil { 4706 ne, nb := n.Applied(ce.Index) 4707 ce.ReturnToPool() 4708 // If we have at least min entries to compact, go ahead and snapshot/compact. 4709 if nb > 0 && ne >= compactNumMin || nb > compactSizeMin { 4710 doSnapshot(false) 4711 } 4712 } else { 4713 s.Warnf("Error applying consumer entries to '%s > %s'", ca.Client.serviceAccount(), ca.Name) 4714 } 4715 } 4716 aq.recycle(&ces) 4717 case isLeader = <-lch: 4718 if recovering && !isLeader { 4719 js.setConsumerAssignmentRecovering(ca) 4720 } 4721 4722 // Process the change. 4723 if err := js.processConsumerLeaderChange(o, isLeader); err == nil && isLeader { 4724 doSnapshot(true) 4725 } 4726 4727 // We may receive a leader change after the consumer assignment which would cancel us 4728 // monitoring for this closely. So re-assess our state here as well. 4729 // Or the old leader is no longer part of the set and transferred leadership 4730 // for this leader to resume with removal 4731 rg := o.raftGroup() 4732 4733 // Check for migrations (peer count and replica count differ) here. 4734 // We set the state on the stream assignment update below. 4735 replicas, err := o.replica() 4736 if err != nil { 4737 continue 4738 } 4739 if isLeader && len(rg.Peers) != replicas { 4740 startMigrationMonitoring() 4741 } else { 4742 stopMigrationMonitoring() 4743 } 4744 case <-uch: 4745 // keep consumer assignment current 4746 ca = o.consumerAssignment() 4747 // We get this when we have a new consumer assignment caused by an update. 4748 // We want to know if we are migrating. 4749 rg := o.raftGroup() 4750 // keep peer list up to date with config 4751 js.checkPeers(rg) 4752 // If we are migrating, monitor for the new peers to be caught up. 4753 replicas, err := o.replica() 4754 if err != nil { 4755 continue 4756 } 4757 if isLeader && len(rg.Peers) != replicas { 4758 startMigrationMonitoring() 4759 } else { 4760 stopMigrationMonitoring() 4761 } 4762 case <-mmtc: 4763 if !isLeader { 4764 // We are no longer leader, so not our job. 4765 stopMigrationMonitoring() 4766 continue 4767 } 4768 rg := o.raftGroup() 4769 ci := js.clusterInfo(rg) 4770 replicas, err := o.replica() 4771 if err != nil { 4772 continue 4773 } 4774 if len(rg.Peers) <= replicas { 4775 // Migration no longer happening, so not our job anymore 4776 stopMigrationMonitoring() 4777 continue 4778 } 4779 newPeers, oldPeers, newPeerSet, _ := genPeerInfo(rg.Peers, len(rg.Peers)-replicas) 4780 4781 // If we are part of the new peerset and we have been passed the baton. 4782 // We will handle scale down. 
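// Concretely (numbers illustrative): an R3 consumer being moved runs with an expanded peer set,
// e.g. 6 peers during the migration. If we are in the new set we propose removal of the old peers
// and forward the shrunken assignment; otherwise we wait until a quorum of the new set
// (replicas/2+1, i.e. 2 of 3) reports current and transfer leadership into it.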
4783 if newPeerSet[ourPeerId] { 4784 for _, p := range oldPeers { 4785 n.ProposeRemovePeer(p) 4786 } 4787 cca := ca.copyGroup() 4788 cca.Group.Peers = newPeers 4789 cca.Group.Cluster = s.cachedClusterName() 4790 cc.meta.ForwardProposal(encodeAddConsumerAssignment(cca)) 4791 s.Noticef("Scaling down '%s > %s > %s' to %+v", ca.Client.serviceAccount(), ca.Stream, ca.Name, s.peerSetToNames(newPeers)) 4792 4793 } else { 4794 var newLeaderPeer, newLeader, newCluster string 4795 neededCurrent, current := replicas/2+1, 0 4796 for _, r := range ci.Replicas { 4797 if r.Current && newPeerSet[r.Peer] { 4798 current++ 4799 if newCluster == _EMPTY_ { 4800 newLeaderPeer, newLeader, newCluster = r.Peer, r.Name, r.cluster 4801 } 4802 } 4803 } 4804 4805 // Check if we have a quorom 4806 if current >= neededCurrent { 4807 s.Noticef("Transfer of consumer leader for '%s > %s > %s' to '%s'", ca.Client.serviceAccount(), ca.Stream, ca.Name, newLeader) 4808 n.StepDown(newLeaderPeer) 4809 } 4810 } 4811 4812 case <-t.C: 4813 doSnapshot(false) 4814 } 4815 } 4816 } 4817 4818 func (js *jetStream) applyConsumerEntries(o *consumer, ce *CommittedEntry, isLeader bool) error { 4819 for _, e := range ce.Entries { 4820 if e.Type == EntrySnapshot { 4821 if !isLeader { 4822 // No-op needed? 4823 state, err := decodeConsumerState(e.Data) 4824 if err != nil { 4825 if mset, node := o.streamAndNode(); mset != nil && node != nil { 4826 s := js.srv 4827 s.Errorf("JetStream cluster could not decode consumer snapshot for '%s > %s > %s' [%s]", 4828 mset.account(), mset.name(), o, node.Group()) 4829 } 4830 panic(err.Error()) 4831 } 4832 if err = o.store.Update(state); err != nil { 4833 o.mu.RLock() 4834 s, acc, mset, name := o.srv, o.acc, o.mset, o.name 4835 o.mu.RUnlock() 4836 if s != nil && mset != nil { 4837 s.Warnf("Consumer '%s > %s > %s' error on store update from snapshot entry: %v", acc, mset.name(), name, err) 4838 } 4839 } else { 4840 o.checkStateForInterestStream() 4841 } 4842 } 4843 4844 } else if e.Type == EntryRemovePeer { 4845 js.mu.RLock() 4846 var ourID string 4847 if js.cluster != nil && js.cluster.meta != nil { 4848 ourID = js.cluster.meta.ID() 4849 } 4850 js.mu.RUnlock() 4851 if peer := string(e.Data); peer == ourID { 4852 shouldRemove := true 4853 if mset := o.getStream(); mset != nil { 4854 if sa := mset.streamAssignment(); sa != nil && sa.Group != nil { 4855 js.mu.RLock() 4856 shouldRemove = !sa.Group.isMember(ourID) 4857 js.mu.RUnlock() 4858 } 4859 } 4860 if shouldRemove { 4861 o.stopWithFlags(true, false, false, false) 4862 } 4863 } 4864 return nil 4865 } else if e.Type == EntryAddPeer { 4866 // Ignore for now. 4867 } else { 4868 buf := e.Data 4869 switch entryOp(buf[0]) { 4870 case updateDeliveredOp: 4871 // These are handled in place in leaders. 4872 if !isLeader { 4873 dseq, sseq, dc, ts, err := decodeDeliveredUpdate(buf[1:]) 4874 if err != nil { 4875 if mset, node := o.streamAndNode(); mset != nil && node != nil { 4876 s := js.srv 4877 s.Errorf("JetStream cluster could not decode consumer delivered update for '%s > %s > %s' [%s]", 4878 mset.account(), mset.name(), o, node.Group()) 4879 } 4880 panic(err.Error()) 4881 } 4882 // Make sure to update delivered under the lock. 
4883 o.mu.Lock() 4884 err = o.store.UpdateDelivered(dseq, sseq, dc, ts) 4885 o.ldt = time.Now() 4886 o.mu.Unlock() 4887 if err != nil { 4888 panic(err.Error()) 4889 } 4890 } 4891 case updateAcksOp: 4892 dseq, sseq, err := decodeAckUpdate(buf[1:]) 4893 if err != nil { 4894 if mset, node := o.streamAndNode(); mset != nil && node != nil { 4895 s := js.srv 4896 s.Errorf("JetStream cluster could not decode consumer ack update for '%s > %s > %s' [%s]", 4897 mset.account(), mset.name(), o, node.Group()) 4898 } 4899 panic(err.Error()) 4900 } 4901 o.processReplicatedAck(dseq, sseq) 4902 case updateSkipOp: 4903 o.mu.Lock() 4904 if !o.isLeader() { 4905 var le = binary.LittleEndian 4906 if sseq := le.Uint64(buf[1:]); sseq > o.sseq { 4907 o.sseq = sseq 4908 } 4909 } 4910 o.mu.Unlock() 4911 case addPendingRequest: 4912 o.mu.Lock() 4913 if !o.isLeader() { 4914 if o.prm == nil { 4915 o.prm = make(map[string]struct{}) 4916 } 4917 o.prm[string(buf[1:])] = struct{}{} 4918 } 4919 o.mu.Unlock() 4920 case removePendingRequest: 4921 o.mu.Lock() 4922 if !o.isLeader() { 4923 if o.prm != nil { 4924 delete(o.prm, string(buf[1:])) 4925 } 4926 } 4927 o.mu.Unlock() 4928 default: 4929 panic(fmt.Sprintf("JetStream Cluster Unknown group entry op type: %v", entryOp(buf[0]))) 4930 } 4931 } 4932 } 4933 return nil 4934 } 4935 4936 func (o *consumer) processReplicatedAck(dseq, sseq uint64) { 4937 o.mu.Lock() 4938 4939 mset := o.mset 4940 if o.closed || mset == nil { 4941 o.mu.Unlock() 4942 return 4943 } 4944 4945 // Update activity. 4946 o.lat = time.Now() 4947 4948 // Do actual ack update to store. 4949 o.store.UpdateAcks(dseq, sseq) 4950 4951 if o.retention == LimitsPolicy { 4952 o.mu.Unlock() 4953 return 4954 } 4955 4956 var sagap uint64 4957 if o.cfg.AckPolicy == AckAll { 4958 if o.isLeader() { 4959 sagap = sseq - o.asflr 4960 } else { 4961 // We are a follower so only have the store state, so read that in. 4962 state, err := o.store.State() 4963 if err != nil { 4964 o.mu.Unlock() 4965 return 4966 } 4967 sagap = sseq - state.AckFloor.Stream 4968 } 4969 } 4970 o.mu.Unlock() 4971 4972 if sagap > 1 { 4973 // FIXME(dlc) - This is very inefficient, will need to fix. 
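// For example, sseq=10 against an ack floor of 6 gives sagap=4, so the loop below acks 10, 9, 8 and 7.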
4974 for seq := sseq; seq > sseq-sagap; seq-- { 4975 mset.ackMsg(o, seq) 4976 } 4977 } else { 4978 mset.ackMsg(o, sseq) 4979 } 4980 } 4981 4982 var errBadAckUpdate = errors.New("jetstream cluster bad replicated ack update") 4983 var errBadDeliveredUpdate = errors.New("jetstream cluster bad replicated delivered update") 4984 4985 func decodeAckUpdate(buf []byte) (dseq, sseq uint64, err error) { 4986 var bi, n int 4987 if dseq, n = binary.Uvarint(buf); n < 0 { 4988 return 0, 0, errBadAckUpdate 4989 } 4990 bi += n 4991 if sseq, n = binary.Uvarint(buf[bi:]); n < 0 { 4992 return 0, 0, errBadAckUpdate 4993 } 4994 return dseq, sseq, nil 4995 } 4996 4997 func decodeDeliveredUpdate(buf []byte) (dseq, sseq, dc uint64, ts int64, err error) { 4998 var bi, n int 4999 if dseq, n = binary.Uvarint(buf); n < 0 { 5000 return 0, 0, 0, 0, errBadDeliveredUpdate 5001 } 5002 bi += n 5003 if sseq, n = binary.Uvarint(buf[bi:]); n < 0 { 5004 return 0, 0, 0, 0, errBadDeliveredUpdate 5005 } 5006 bi += n 5007 if dc, n = binary.Uvarint(buf[bi:]); n < 0 { 5008 return 0, 0, 0, 0, errBadDeliveredUpdate 5009 } 5010 bi += n 5011 if ts, n = binary.Varint(buf[bi:]); n < 0 { 5012 return 0, 0, 0, 0, errBadDeliveredUpdate 5013 } 5014 return dseq, sseq, dc, ts, nil 5015 } 5016 5017 func (js *jetStream) processConsumerLeaderChange(o *consumer, isLeader bool) error { 5018 stepDownIfLeader := func() error { 5019 if node := o.raftNode(); node != nil && isLeader { 5020 node.StepDown() 5021 } 5022 return errors.New("failed to update consumer leader status") 5023 } 5024 5025 if o == nil || o.isClosed() { 5026 return stepDownIfLeader() 5027 } 5028 5029 ca := o.consumerAssignment() 5030 if ca == nil { 5031 return stepDownIfLeader() 5032 } 5033 js.mu.Lock() 5034 s, account, err := js.srv, ca.Client.serviceAccount(), ca.err 5035 client, subject, reply, streamName, consumerName := ca.Client, ca.Subject, ca.Reply, ca.Stream, ca.Name 5036 hasResponded := ca.responded 5037 ca.responded = true 5038 js.mu.Unlock() 5039 5040 acc, _ := s.LookupAccount(account) 5041 if acc == nil { 5042 return stepDownIfLeader() 5043 } 5044 5045 if isLeader { 5046 s.Noticef("JetStream cluster new consumer leader for '%s > %s > %s'", ca.Client.serviceAccount(), streamName, consumerName) 5047 s.sendConsumerLeaderElectAdvisory(o) 5048 // Check for peer removal and process here if needed. 5049 js.checkPeers(ca.Group) 5050 } else { 5051 // We are stepping down. 5052 // Make sure if we are doing so because we have lost quorum that we send the appropriate advisories. 5053 if node := o.raftNode(); node != nil && !node.Quorum() && time.Since(node.Created()) > 5*time.Second { 5054 s.sendConsumerLostQuorumAdvisory(o) 5055 } 5056 } 5057 5058 // Tell consumer to switch leader status. 5059 o.setLeader(isLeader) 5060 5061 if !isLeader || hasResponded { 5062 if isLeader { 5063 o.clearInitialInfo() 5064 } 5065 return nil 5066 } 5067 5068 var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} 5069 if err != nil { 5070 resp.Error = NewJSConsumerCreateError(err, Unless(err)) 5071 s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 5072 } else { 5073 resp.ConsumerInfo = o.initialInfo() 5074 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 5075 if node := o.raftNode(); node != nil { 5076 o.sendCreateAdvisory() 5077 } 5078 } 5079 5080 // Only send a pause advisory on consumer create if we're 5081 // actually paused. 
The timer would have been kicked by now 5082 // by the call to o.setLeader() above. 5083 if isLeader && o.cfg.PauseUntil != nil && !o.cfg.PauseUntil.IsZero() && time.Now().Before(*o.cfg.PauseUntil) { 5084 o.sendPauseAdvisoryLocked(&o.cfg) 5085 } 5086 5087 return nil 5088 } 5089 5090 // Determines if we should send lost quorum advisory. We throttle these after first one. 5091 func (o *consumer) shouldSendLostQuorum() bool { 5092 o.mu.Lock() 5093 defer o.mu.Unlock() 5094 if time.Since(o.lqsent) >= lostQuorumAdvInterval { 5095 o.lqsent = time.Now() 5096 return true 5097 } 5098 return false 5099 } 5100 5101 func (s *Server) sendConsumerLostQuorumAdvisory(o *consumer) { 5102 if o == nil { 5103 return 5104 } 5105 node, stream, consumer, acc := o.raftNode(), o.streamName(), o.String(), o.account() 5106 if node == nil { 5107 return 5108 } 5109 if !o.shouldSendLostQuorum() { 5110 return 5111 } 5112 5113 s.Warnf("JetStream cluster consumer '%s > %s > %s' has NO quorum, stalled.", acc.GetName(), stream, consumer) 5114 5115 subj := JSAdvisoryConsumerQuorumLostPre + "." + stream + "." + consumer 5116 adv := &JSConsumerQuorumLostAdvisory{ 5117 TypedEvent: TypedEvent{ 5118 Type: JSConsumerQuorumLostAdvisoryType, 5119 ID: nuid.Next(), 5120 Time: time.Now().UTC(), 5121 }, 5122 Stream: stream, 5123 Consumer: consumer, 5124 Replicas: s.replicas(node), 5125 Domain: s.getOpts().JetStreamDomain, 5126 } 5127 5128 // Send to the user's account if not the system account. 5129 if acc != s.SystemAccount() { 5130 s.publishAdvisory(acc, subj, adv) 5131 } 5132 // Now do system level one. Place account info in adv, and nil account means system. 5133 adv.Account = acc.GetName() 5134 s.publishAdvisory(nil, subj, adv) 5135 } 5136 5137 func (s *Server) sendConsumerLeaderElectAdvisory(o *consumer) { 5138 if o == nil { 5139 return 5140 } 5141 node, stream, consumer, acc := o.raftNode(), o.streamName(), o.String(), o.account() 5142 if node == nil { 5143 return 5144 } 5145 5146 subj := JSAdvisoryConsumerLeaderElectedPre + "." + stream + "." + consumer 5147 adv := &JSConsumerLeaderElectedAdvisory{ 5148 TypedEvent: TypedEvent{ 5149 Type: JSConsumerLeaderElectedAdvisoryType, 5150 ID: nuid.Next(), 5151 Time: time.Now().UTC(), 5152 }, 5153 Stream: stream, 5154 Consumer: consumer, 5155 Leader: s.serverNameForNode(node.GroupLeader()), 5156 Replicas: s.replicas(node), 5157 Domain: s.getOpts().JetStreamDomain, 5158 } 5159 5160 // Send to the user's account if not the system account. 5161 if acc != s.SystemAccount() { 5162 s.publishAdvisory(acc, subj, adv) 5163 } 5164 // Now do system level one. Place account info in adv, and nil account means system. 5165 adv.Account = acc.GetName() 5166 s.publishAdvisory(nil, subj, adv) 5167 } 5168 5169 type streamAssignmentResult struct { 5170 Account string `json:"account"` 5171 Stream string `json:"stream"` 5172 Response *JSApiStreamCreateResponse `json:"create_response,omitempty"` 5173 Restore *JSApiStreamRestoreResponse `json:"restore_response,omitempty"` 5174 Update bool `json:"is_update,omitempty"` 5175 } 5176 5177 // Determine if this is an insufficient resources' error type. 5178 func isInsufficientResourcesErr(resp *JSApiStreamCreateResponse) bool { 5179 return resp != nil && resp.Error != nil && IsNatsErr(resp.Error, JSInsufficientResourcesErr, JSMemoryResourcesExceededErr, JSStorageResourcesExceededErr) 5180 } 5181 5182 // Process error results of stream and consumer assignments. 5183 // Success will be handled by stream leader. 
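// The message body is a JSON-encoded streamAssignmentResult (see above); roughly, a failed create
// looks like {"account":"ACME","stream":"ORDERS","create_response":{...}} while a failed restore
// carries "restore_response" instead (account and stream names here are illustrative).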
5184 func (js *jetStream) processStreamAssignmentResults(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 5185 var result streamAssignmentResult 5186 if err := json.Unmarshal(msg, &result); err != nil { 5187 // TODO(dlc) - log 5188 return 5189 } 5190 acc, _ := js.srv.LookupAccount(result.Account) 5191 if acc == nil { 5192 // TODO(dlc) - log 5193 return 5194 } 5195 5196 js.mu.Lock() 5197 defer js.mu.Unlock() 5198 5199 s, cc := js.srv, js.cluster 5200 if cc == nil || cc.meta == nil { 5201 return 5202 } 5203 5204 // This should have been done already in processStreamAssignment, but in 5205 // case we have a code path that gets here with no processStreamAssignment, 5206 // then we will do the proper thing. Otherwise will be a no-op. 5207 cc.removeInflightProposal(result.Account, result.Stream) 5208 5209 // FIXME(dlc) - suppress duplicates? 5210 if sa := js.streamAssignment(result.Account, result.Stream); sa != nil { 5211 canDelete := !result.Update && time.Since(sa.Created) < 5*time.Second 5212 5213 // See if we should retry in case this cluster is full but there are others. 5214 if cfg, ci := sa.Config, sa.Client; cfg != nil && ci != nil && isInsufficientResourcesErr(result.Response) && canDelete { 5215 // If cluster is defined we can not retry. 5216 if cfg.Placement == nil || cfg.Placement.Cluster == _EMPTY_ { 5217 // If we have additional clusters to try we can retry. 5218 if ci != nil && len(ci.Alternates) > 0 { 5219 if rg, err := js.createGroupForStream(ci, cfg); err != nil { 5220 s.Warnf("Retrying cluster placement for stream '%s > %s' failed due to placement error: %+v", result.Account, result.Stream, err) 5221 } else { 5222 if org := sa.Group; org != nil && len(org.Peers) > 0 { 5223 s.Warnf("Retrying cluster placement for stream '%s > %s' due to insufficient resources in cluster %q", 5224 result.Account, result.Stream, s.clusterNameForNode(org.Peers[0])) 5225 } else { 5226 s.Warnf("Retrying cluster placement for stream '%s > %s' due to insufficient resources", result.Account, result.Stream) 5227 } 5228 // Pick a new preferred leader. 5229 rg.setPreferred() 5230 // Get rid of previous attempt. 5231 cc.meta.Propose(encodeDeleteStreamAssignment(sa)) 5232 // Propose new. 5233 sa.Group, sa.err = rg, nil 5234 cc.meta.Propose(encodeAddStreamAssignment(sa)) 5235 return 5236 } 5237 } 5238 } 5239 } 5240 5241 // Respond to the user here. 5242 var resp string 5243 if result.Response != nil { 5244 resp = s.jsonResponse(result.Response) 5245 } else if result.Restore != nil { 5246 resp = s.jsonResponse(result.Restore) 5247 } 5248 if !sa.responded || result.Update { 5249 sa.responded = true 5250 js.srv.sendAPIErrResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, resp) 5251 } 5252 // Remove this assignment if possible. 
5253 if canDelete {
5254 sa.err = NewJSClusterNotAssignedError()
5255 cc.meta.Propose(encodeDeleteStreamAssignment(sa))
5256 }
5257 }
5258 }
5259 
5260 func (js *jetStream) processConsumerAssignmentResults(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
5261 var result consumerAssignmentResult
5262 if err := json.Unmarshal(msg, &result); err != nil {
5263 // TODO(dlc) - log
5264 return
5265 }
5266 acc, _ := js.srv.LookupAccount(result.Account)
5267 if acc == nil {
5268 // TODO(dlc) - log
5269 return
5270 }
5271 
5272 js.mu.Lock()
5273 defer js.mu.Unlock()
5274 
5275 s, cc := js.srv, js.cluster
5276 if cc == nil || cc.meta == nil {
5277 return
5278 }
5279 
5280 if sa := js.streamAssignment(result.Account, result.Stream); sa != nil && sa.consumers != nil {
5281 if ca := sa.consumers[result.Consumer]; ca != nil && !ca.responded {
5282 js.srv.sendAPIErrResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(result.Response))
5283 ca.responded = true
5284 
5285 // Check if this failed.
5286 // TODO(dlc) - Could have mixed results, should track per peer.
5287 // Make sure this is recent response, do not delete existing consumers.
5288 if result.Response.Error != nil && result.Response.Error != NewJSConsumerNameExistError() && time.Since(ca.Created) < 2*time.Second {
5289 // So while we are deleting we will not respond to list/names requests.
5290 ca.err = NewJSClusterNotAssignedError()
5291 cc.meta.Propose(encodeDeleteConsumerAssignment(ca))
5292 s.Warnf("Proposing to delete consumer '%s > %s > %s' due to assignment response error: %v",
5293 result.Account, result.Stream, result.Consumer, result.Response.Error)
5294 }
5295 }
5296 }
5297 }
5298 
5299 const (
5300 streamAssignmentSubj = "$SYS.JSC.STREAM.ASSIGNMENT.RESULT"
5301 consumerAssignmentSubj = "$SYS.JSC.CONSUMER.ASSIGNMENT.RESULT"
5302 )
5303 
5304 // Lock should be held.
5305 func (js *jetStream) startUpdatesSub() {
5306 cc, s, c := js.cluster, js.srv, js.cluster.c
5307 if cc.streamResults == nil {
5308 cc.streamResults, _ = s.systemSubscribe(streamAssignmentSubj, _EMPTY_, false, c, js.processStreamAssignmentResults)
5309 }
5310 if cc.consumerResults == nil {
5311 cc.consumerResults, _ = s.systemSubscribe(consumerAssignmentSubj, _EMPTY_, false, c, js.processConsumerAssignmentResults)
5312 }
5313 if cc.stepdown == nil {
5314 cc.stepdown, _ = s.systemSubscribe(JSApiLeaderStepDown, _EMPTY_, false, c, s.jsLeaderStepDownRequest)
5315 }
5316 if cc.peerRemove == nil {
5317 cc.peerRemove, _ = s.systemSubscribe(JSApiRemoveServer, _EMPTY_, false, c, s.jsLeaderServerRemoveRequest)
5318 }
5319 if cc.peerStreamMove == nil {
5320 cc.peerStreamMove, _ = s.systemSubscribe(JSApiServerStreamMove, _EMPTY_, false, c, s.jsLeaderServerStreamMoveRequest)
5321 }
5322 if cc.peerStreamCancelMove == nil {
5323 cc.peerStreamCancelMove, _ = s.systemSubscribe(JSApiServerStreamCancelMove, _EMPTY_, false, c, s.jsLeaderServerStreamCancelMoveRequest)
5324 }
5325 if js.accountPurge == nil {
5326 js.accountPurge, _ = s.systemSubscribe(JSApiAccountPurge, _EMPTY_, false, c, s.jsLeaderAccountPurgeRequest)
5327 }
5328 }
5329 
5330 // Lock should be held.
5331 func (js *jetStream) stopUpdatesSub() {
5332 cc := js.cluster
5333 if cc.streamResults != nil {
5334 cc.s.sysUnsubscribe(cc.streamResults)
5335 cc.streamResults = nil
5336 }
5337 if cc.consumerResults != nil {
5338 cc.s.sysUnsubscribe(cc.consumerResults)
5339 cc.consumerResults = nil
5340 }
5341 if cc.stepdown != nil {
5342 cc.s.sysUnsubscribe(cc.stepdown)
5343 cc.stepdown = nil
5344 }
5345 if cc.peerRemove != nil {
5346 cc.s.sysUnsubscribe(cc.peerRemove)
5347 cc.peerRemove = nil
5348 }
5349 if cc.peerStreamMove != nil {
5350 cc.s.sysUnsubscribe(cc.peerStreamMove)
5351 cc.peerStreamMove = nil
5352 }
5353 if cc.peerStreamCancelMove != nil {
5354 cc.s.sysUnsubscribe(cc.peerStreamCancelMove)
5355 cc.peerStreamCancelMove = nil
5356 }
5357 if js.accountPurge != nil {
5358 cc.s.sysUnsubscribe(js.accountPurge)
5359 js.accountPurge = nil
5360 }
5361 }
5362 
5363 func (s *Server) sendDomainLeaderElectAdvisory() {
5364 js, cc := s.getJetStreamCluster()
5365 if js == nil || cc == nil {
5366 return
5367 }
5368 
5369 js.mu.RLock()
5370 node := cc.meta
5371 js.mu.RUnlock()
5372 
5373 adv := &JSDomainLeaderElectedAdvisory{
5374 TypedEvent: TypedEvent{
5375 Type: JSDomainLeaderElectedAdvisoryType,
5376 ID: nuid.Next(),
5377 Time: time.Now().UTC(),
5378 },
5379 Leader: node.GroupLeader(),
5380 Replicas: s.replicas(node),
5381 Cluster: s.cachedClusterName(),
5382 Domain: s.getOpts().JetStreamDomain,
5383 }
5384 
5385 s.publishAdvisory(nil, JSAdvisoryDomainLeaderElected, adv)
5386 }
5387 
5388 func (js *jetStream) processLeaderChange(isLeader bool) {
5389 if js == nil {
5390 return
5391 }
5392 s := js.srv
5393 if s == nil {
5394 return
5395 }
5396 // Update our server atomic.
5397 s.isMetaLeader.Store(isLeader)
5398 
5399 if isLeader {
5400 s.Noticef("Self is new JetStream cluster metadata leader")
5401 s.sendDomainLeaderElectAdvisory()
5402 } else {
5403 var node string
5404 if meta := js.getMetaGroup(); meta != nil {
5405 node = meta.GroupLeader()
5406 }
5407 if node == _EMPTY_ {
5408 s.Noticef("JetStream cluster no metadata leader")
5409 } else if srv := js.srv.serverNameForNode(node); srv == _EMPTY_ {
5410 s.Noticef("JetStream cluster new remote metadata leader")
5411 } else if clst := js.srv.clusterNameForNode(node); clst == _EMPTY_ {
5412 s.Noticef("JetStream cluster new metadata leader: %s", srv)
5413 } else {
5414 s.Noticef("JetStream cluster new metadata leader: %s/%s", srv, clst)
5415 }
5416 }
5417 
5418 js.mu.Lock()
5419 defer js.mu.Unlock()
5420 
5421 if isLeader {
5422 js.startUpdatesSub()
5423 } else {
5424 js.stopUpdatesSub()
5425 // TODO(dlc) - stepdown.
5426 }
5427 
5428 // If we have been signaled to check the streams, this is for a bug that left stream
5429 // assignments with no sync subject after an update and no way to sync/catchup outside of the RAFT layer.
5430 if isLeader && js.cluster.streamsCheck {
5431 cc := js.cluster
5432 for acc, asa := range cc.streams {
5433 for _, sa := range asa {
5434 if sa.Sync == _EMPTY_ {
5435 s.Warnf("Stream assignment corrupt for stream '%s > %s'", acc, sa.Config.Name)
5436 nsa := &streamAssignment{Group: sa.Group, Config: sa.Config, Subject: sa.Subject, Reply: sa.Reply, Client: sa.Client}
5437 nsa.Sync = syncSubjForStream()
5438 cc.meta.Propose(encodeUpdateStreamAssignment(nsa))
5439 }
5440 }
5441 }
5442 // Clear check.
5443 cc.streamsCheck = false
5444 }
5445 }
5446 
5447 // Lock should be held.
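// remapStreamAssignment returns true only when a full replacement peer set could be selected;
// if placement fails it may still shrink an R>1 group by dropping removePeer, and returns false.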
5448 func (cc *jetStreamCluster) remapStreamAssignment(sa *streamAssignment, removePeer string) bool { 5449 // Invoke placement algo passing RG peers that stay (existing) and the peer that is being removed (ignore) 5450 var retain, ignore []string 5451 for _, v := range sa.Group.Peers { 5452 if v == removePeer { 5453 ignore = append(ignore, v) 5454 } else { 5455 retain = append(retain, v) 5456 } 5457 } 5458 5459 newPeers, placementError := cc.selectPeerGroup(len(sa.Group.Peers), sa.Group.Cluster, sa.Config, retain, 0, ignore) 5460 5461 if placementError == nil { 5462 sa.Group.Peers = newPeers 5463 // Don't influence preferred leader. 5464 sa.Group.Preferred = _EMPTY_ 5465 return true 5466 } 5467 5468 // If R1 just return to avoid bricking the stream. 5469 if sa.Group.node == nil || len(sa.Group.Peers) == 1 { 5470 return false 5471 } 5472 5473 // If we are here let's remove the peer at least, as long as we are R>1 5474 for i, peer := range sa.Group.Peers { 5475 if peer == removePeer { 5476 sa.Group.Peers[i] = sa.Group.Peers[len(sa.Group.Peers)-1] 5477 sa.Group.Peers = sa.Group.Peers[:len(sa.Group.Peers)-1] 5478 break 5479 } 5480 } 5481 return false 5482 } 5483 5484 type selectPeerError struct { 5485 excludeTag bool 5486 offline bool 5487 noStorage bool 5488 uniqueTag bool 5489 misc bool 5490 noJsClust bool 5491 noMatchTags map[string]struct{} 5492 } 5493 5494 func (e *selectPeerError) Error() string { 5495 b := strings.Builder{} 5496 writeBoolErrReason := func(hasErr bool, errMsg string) { 5497 if !hasErr { 5498 return 5499 } 5500 b.WriteString(", ") 5501 b.WriteString(errMsg) 5502 } 5503 b.WriteString("no suitable peers for placement") 5504 writeBoolErrReason(e.offline, "peer offline") 5505 writeBoolErrReason(e.excludeTag, "exclude tag set") 5506 writeBoolErrReason(e.noStorage, "insufficient storage") 5507 writeBoolErrReason(e.uniqueTag, "server tag not unique") 5508 writeBoolErrReason(e.misc, "miscellaneous issue") 5509 writeBoolErrReason(e.noJsClust, "jetstream not enabled in cluster") 5510 if len(e.noMatchTags) != 0 { 5511 b.WriteString(", tags not matched [") 5512 var firstTagWritten bool 5513 for tag := range e.noMatchTags { 5514 if firstTagWritten { 5515 b.WriteString(", ") 5516 } 5517 firstTagWritten = true 5518 b.WriteRune('\'') 5519 b.WriteString(tag) 5520 b.WriteRune('\'') 5521 } 5522 b.WriteString("]") 5523 } 5524 return b.String() 5525 } 5526 5527 func (e *selectPeerError) addMissingTag(t string) { 5528 if e.noMatchTags == nil { 5529 e.noMatchTags = map[string]struct{}{} 5530 } 5531 e.noMatchTags[t] = struct{}{} 5532 } 5533 5534 func (e *selectPeerError) accumulate(eAdd *selectPeerError) { 5535 if eAdd == nil { 5536 return 5537 } 5538 acc := func(val *bool, valAdd bool) { 5539 if valAdd { 5540 *val = valAdd 5541 } 5542 } 5543 acc(&e.offline, eAdd.offline) 5544 acc(&e.excludeTag, eAdd.excludeTag) 5545 acc(&e.noStorage, eAdd.noStorage) 5546 acc(&e.uniqueTag, eAdd.uniqueTag) 5547 acc(&e.misc, eAdd.misc) 5548 acc(&e.noJsClust, eAdd.noJsClust) 5549 for tag := range eAdd.noMatchTags { 5550 e.addMissingTag(tag) 5551 } 5552 } 5553 5554 // selectPeerGroup will select a group of peers to start a raft group. 5555 // when peers exist already the unique tag prefix check for the replaceFirstExisting will be skipped 5556 // js lock should be held. 
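// r is the desired group size. Peers in existing are always kept and returned at the front of the
// result, peers in ignore are never considered, and the first replaceFirstExisting existing peers
// are treated as being replaced, so their unique tags are not reserved.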
5557 func (cc *jetStreamCluster) selectPeerGroup(r int, cluster string, cfg *StreamConfig, existing []string, replaceFirstExisting int, ignore []string) ([]string, *selectPeerError) { 5558 if cluster == _EMPTY_ || cfg == nil { 5559 return nil, &selectPeerError{misc: true} 5560 } 5561 5562 var maxBytes uint64 5563 if cfg.MaxBytes > 0 { 5564 maxBytes = uint64(cfg.MaxBytes) 5565 } 5566 5567 // Check for tags. 5568 var tags []string 5569 if cfg.Placement != nil && len(cfg.Placement.Tags) > 0 { 5570 tags = cfg.Placement.Tags 5571 } 5572 5573 // Used for weighted sorting based on availability. 5574 type wn struct { 5575 id string 5576 avail uint64 5577 ha int 5578 ns int 5579 } 5580 5581 var nodes []wn 5582 // peers is a randomized list 5583 s, peers := cc.s, cc.meta.Peers() 5584 5585 uniqueTagPrefix := s.getOpts().JetStreamUniqueTag 5586 if uniqueTagPrefix != _EMPTY_ { 5587 for _, tag := range tags { 5588 if strings.HasPrefix(tag, uniqueTagPrefix) { 5589 // disable uniqueness check if explicitly listed in tags 5590 uniqueTagPrefix = _EMPTY_ 5591 break 5592 } 5593 } 5594 } 5595 var uniqueTags = make(map[string]*nodeInfo) 5596 5597 checkUniqueTag := func(ni *nodeInfo) (bool, *nodeInfo) { 5598 for _, t := range ni.tags { 5599 if strings.HasPrefix(t, uniqueTagPrefix) { 5600 if n, ok := uniqueTags[t]; !ok { 5601 uniqueTags[t] = ni 5602 return true, ni 5603 } else { 5604 return false, n 5605 } 5606 } 5607 } 5608 // default requires the unique prefix to be present 5609 return false, nil 5610 } 5611 5612 // Map existing. 5613 var ep map[string]struct{} 5614 if le := len(existing); le > 0 { 5615 if le >= r { 5616 return existing[:r], nil 5617 } 5618 ep = make(map[string]struct{}) 5619 for i, p := range existing { 5620 ep[p] = struct{}{} 5621 if uniqueTagPrefix == _EMPTY_ { 5622 continue 5623 } 5624 si, ok := s.nodeToInfo.Load(p) 5625 if !ok || si == nil || i < replaceFirstExisting { 5626 continue 5627 } 5628 ni := si.(nodeInfo) 5629 // collect unique tags, but do not require them as this node is already part of the peerset 5630 checkUniqueTag(&ni) 5631 } 5632 } 5633 5634 // Map ignore 5635 var ip map[string]struct{} 5636 if li := len(ignore); li > 0 { 5637 ip = make(map[string]struct{}) 5638 for _, p := range ignore { 5639 ip[p] = struct{}{} 5640 } 5641 } 5642 5643 // Grab the number of streams and HA assets currently assigned to each peer. 5644 // HAAssets under usage is async, so calculate here in realtime based on assignments. 5645 peerStreams := make(map[string]int, len(peers)) 5646 peerHA := make(map[string]int, len(peers)) 5647 for _, asa := range cc.streams { 5648 for _, sa := range asa { 5649 isHA := len(sa.Group.Peers) > 1 5650 for _, peer := range sa.Group.Peers { 5651 peerStreams[peer]++ 5652 if isHA { 5653 peerHA[peer]++ 5654 } 5655 } 5656 } 5657 } 5658 5659 maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets 5660 5661 // An error is a result of multiple individual placement decisions. 5662 // Which is why we keep taps on how often which one happened. 5663 err := selectPeerError{} 5664 5665 // Shuffle them up. 5666 rand.Shuffle(len(peers), func(i, j int) { peers[i], peers[j] = peers[j], peers[i] }) 5667 for _, p := range peers { 5668 si, ok := s.nodeToInfo.Load(p.ID) 5669 if !ok || si == nil { 5670 err.misc = true 5671 continue 5672 } 5673 ni := si.(nodeInfo) 5674 // Only select from the designated named cluster. 
5675 if ni.cluster != cluster { 5676 s.Debugf("Peer selection: discard %s@%s reason: not target cluster %s", ni.name, ni.cluster, cluster) 5677 continue 5678 } 5679 5680 // If we know its offline or we do not have config or err don't consider. 5681 if ni.offline || ni.cfg == nil || ni.stats == nil { 5682 s.Debugf("Peer selection: discard %s@%s reason: offline", ni.name, ni.cluster) 5683 err.offline = true 5684 continue 5685 } 5686 5687 // If ignore skip 5688 if _, ok := ip[p.ID]; ok { 5689 continue 5690 } 5691 5692 // If existing also skip, we will add back in to front of the list when done. 5693 if _, ok := ep[p.ID]; ok { 5694 continue 5695 } 5696 5697 if ni.tags.Contains(jsExcludePlacement) { 5698 s.Debugf("Peer selection: discard %s@%s tags: %v reason: %s present", 5699 ni.name, ni.cluster, ni.tags, jsExcludePlacement) 5700 err.excludeTag = true 5701 continue 5702 } 5703 5704 if len(tags) > 0 { 5705 matched := true 5706 for _, t := range tags { 5707 if !ni.tags.Contains(t) { 5708 matched = false 5709 s.Debugf("Peer selection: discard %s@%s tags: %v reason: mandatory tag %s not present", 5710 ni.name, ni.cluster, ni.tags, t) 5711 err.addMissingTag(t) 5712 break 5713 } 5714 } 5715 if !matched { 5716 continue 5717 } 5718 } 5719 5720 var available uint64 5721 if ni.stats != nil { 5722 switch cfg.Storage { 5723 case MemoryStorage: 5724 used := ni.stats.ReservedMemory 5725 if ni.stats.Memory > used { 5726 used = ni.stats.Memory 5727 } 5728 if ni.cfg.MaxMemory > int64(used) { 5729 available = uint64(ni.cfg.MaxMemory) - used 5730 } 5731 case FileStorage: 5732 used := ni.stats.ReservedStore 5733 if ni.stats.Store > used { 5734 used = ni.stats.Store 5735 } 5736 if ni.cfg.MaxStore > int64(used) { 5737 available = uint64(ni.cfg.MaxStore) - used 5738 } 5739 } 5740 } 5741 5742 // Otherwise check if we have enough room if maxBytes set. 5743 if maxBytes > 0 && maxBytes > available { 5744 s.Warnf("Peer selection: discard %s@%s (Max Bytes: %d) exceeds available %s storage of %d bytes", 5745 ni.name, ni.cluster, maxBytes, cfg.Storage.String(), available) 5746 err.noStorage = true 5747 continue 5748 } 5749 // HAAssets contain _meta_ which we want to ignore, hence > and not >=. 5750 if maxHaAssets > 0 && ni.stats != nil && ni.stats.HAAssets > maxHaAssets { 5751 s.Warnf("Peer selection: discard %s@%s (HA Asset Count: %d) exceeds max ha asset limit of %d for stream placement", 5752 ni.name, ni.cluster, ni.stats.HAAssets, maxHaAssets) 5753 err.misc = true 5754 continue 5755 } 5756 5757 if uniqueTagPrefix != _EMPTY_ { 5758 if unique, owner := checkUniqueTag(&ni); !unique { 5759 if owner != nil { 5760 s.Debugf("Peer selection: discard %s@%s tags:%v reason: unique prefix %s owned by %s@%s", 5761 ni.name, ni.cluster, ni.tags, owner.name, owner.cluster) 5762 } else { 5763 s.Debugf("Peer selection: discard %s@%s tags:%v reason: unique prefix %s not present", 5764 ni.name, ni.cluster, ni.tags) 5765 } 5766 err.uniqueTag = true 5767 continue 5768 } 5769 } 5770 // Add to our list of potential nodes. 5771 nodes = append(nodes, wn{p.ID, available, peerHA[p.ID], peerStreams[p.ID]}) 5772 } 5773 5774 // If we could not select enough peers, fail. 
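// Editor's note (illustrative, not part of the original source): "enough" here
// means r-len(existing) additional candidates. For example, scaling an R1
// stream up to R3 keeps its single existing peer and so needs at least two new
// nodes to survive the filters above; if fewer remain, the accumulated
// selectPeerError is returned so the caller can report every rejection reason.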
5775 if len(nodes) < (r - len(existing)) { 5776 s.Debugf("Peer selection: required %d nodes but found %d (cluster: %s replica: %d existing: %v/%d peers: %d result-peers: %d err: %+v)", 5777 (r - len(existing)), len(nodes), cluster, r, existing, replaceFirstExisting, len(peers), len(nodes), err) 5778 if len(peers) == 0 { 5779 err.noJsClust = true 5780 } 5781 return nil, &err 5782 } 5783 // Sort based on available from most to least, breaking ties by number of total streams assigned to the peer. 5784 sort.Slice(nodes, func(i, j int) bool { 5785 if nodes[i].avail == nodes[j].avail { 5786 return nodes[i].ns < nodes[j].ns 5787 } 5788 return nodes[i].avail > nodes[j].avail 5789 }) 5790 // If we are placing a replicated stream, let's sort based on HAAssets, as that is more important to balance. 5791 if cfg.Replicas > 1 { 5792 sort.SliceStable(nodes, func(i, j int) bool { return nodes[i].ha < nodes[j].ha }) 5793 } 5794 5795 var results []string 5796 if len(existing) > 0 { 5797 results = append(results, existing...) 5798 r -= len(existing) 5799 } 5800 for _, r := range nodes[:r] { 5801 results = append(results, r.id) 5802 } 5803 return results, nil 5804 } 5805 5806 func groupNameForStream(peers []string, storage StorageType) string { 5807 return groupName("S", peers, storage) 5808 } 5809 5810 func groupNameForConsumer(peers []string, storage StorageType) string { 5811 return groupName("C", peers, storage) 5812 } 5813 5814 func groupName(prefix string, peers []string, storage StorageType) string { 5815 gns := getHash(nuid.Next()) 5816 return fmt.Sprintf("%s-R%d%s-%s", prefix, len(peers), storage.String()[:1], gns) 5817 } 5818 5819 // returns stream count for this tier as well as applicable reservation size (not including reservations for cfg) 5820 // jetStream read lock should be held 5821 func tieredStreamAndReservationCount(asa map[string]*streamAssignment, tier string, cfg *StreamConfig) (int, int64) { 5822 var numStreams int 5823 var reservation int64 5824 for _, sa := range asa { 5825 if tier == _EMPTY_ || isSameTier(sa.Config, cfg) { 5826 numStreams++ 5827 if sa.Config.MaxBytes > 0 && sa.Config.Storage == cfg.Storage && sa.Config.Name != cfg.Name { 5828 // If tier is empty, all storage is flat and we should adjust for replicas. 5829 // Otherwise if tiered, storage replication already taken into consideration. 5830 if tier == _EMPTY_ && cfg.Replicas > 1 { 5831 reservation += sa.Config.MaxBytes * int64(cfg.Replicas) 5832 } else { 5833 reservation += sa.Config.MaxBytes 5834 } 5835 } 5836 } 5837 } 5838 return numStreams, reservation 5839 } 5840 5841 // createGroupForStream will create a group for assignment for the stream. 5842 // Lock should be held. 5843 func (js *jetStream) createGroupForStream(ci *ClientInfo, cfg *StreamConfig) (*raftGroup, *selectPeerError) { 5844 replicas := cfg.Replicas 5845 if replicas == 0 { 5846 replicas = 1 5847 } 5848 5849 // Default connected cluster from the request origin. 5850 cc, cluster := js.cluster, ci.Cluster 5851 // If specified, override the default. 5852 clusterDefined := cfg.Placement != nil && cfg.Placement.Cluster != _EMPTY_ 5853 if clusterDefined { 5854 cluster = cfg.Placement.Cluster 5855 } 5856 clusters := []string{cluster} 5857 if !clusterDefined { 5858 clusters = append(clusters, ci.Alternates...) 5859 } 5860 5861 // Need to create a group here. 
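// Editor's note (illustrative, not part of the original source): placement is
// attempted against each candidate cluster in turn,
//
//	for _, cn := range clusters {
//		peers, err := cc.selectPeerGroup(replicas, cn, cfg, nil, 0, nil)
//		...
//	}
//
// trying the request origin's cluster first and then any alternates, while
// accumulating the per-cluster selection errors so a final failure can report
// why every cluster was rejected.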
5862 errs := &selectPeerError{} 5863 for _, cn := range clusters { 5864 peers, err := cc.selectPeerGroup(replicas, cn, cfg, nil, 0, nil) 5865 if len(peers) < replicas { 5866 errs.accumulate(err) 5867 continue 5868 } 5869 return &raftGroup{Name: groupNameForStream(peers, cfg.Storage), Storage: cfg.Storage, Peers: peers, Cluster: cn}, nil 5870 } 5871 return nil, errs 5872 } 5873 5874 func (acc *Account) selectLimits(cfg *StreamConfig) (*JetStreamAccountLimits, string, *jsAccount, *ApiError) { 5875 // Grab our jetstream account info. 5876 acc.mu.RLock() 5877 jsa := acc.js 5878 acc.mu.RUnlock() 5879 5880 if jsa == nil { 5881 return nil, _EMPTY_, nil, NewJSNotEnabledForAccountError() 5882 } 5883 5884 jsa.usageMu.RLock() 5885 selectedLimits, tierName, ok := jsa.selectLimits(cfg) 5886 jsa.usageMu.RUnlock() 5887 5888 if !ok { 5889 return nil, _EMPTY_, nil, NewJSNoLimitsError() 5890 } 5891 return &selectedLimits, tierName, jsa, nil 5892 } 5893 5894 // Read lock needs to be held 5895 func (js *jetStream) jsClusteredStreamLimitsCheck(acc *Account, cfg *StreamConfig) *ApiError { 5896 selectedLimits, tier, _, apiErr := acc.selectLimits(cfg) 5897 if apiErr != nil { 5898 return apiErr 5899 } 5900 5901 asa := js.cluster.streams[acc.Name] 5902 numStreams, reservations := tieredStreamAndReservationCount(asa, tier, cfg) 5903 // Check for inflight proposals... 5904 if cc := js.cluster; cc != nil && cc.inflight != nil { 5905 numStreams += len(cc.inflight[acc.Name]) 5906 } 5907 if selectedLimits.MaxStreams > 0 && numStreams >= selectedLimits.MaxStreams { 5908 return NewJSMaximumStreamsLimitError() 5909 } 5910 // Check for account limits here before proposing. 5911 if err := js.checkAccountLimits(selectedLimits, cfg, reservations); err != nil { 5912 return NewJSStreamLimitsError(err, Unless(err)) 5913 } 5914 return nil 5915 } 5916 5917 func (s *Server) jsClusteredStreamRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, config *StreamConfig) { 5918 js, cc := s.getJetStreamCluster() 5919 if js == nil || cc == nil { 5920 return 5921 } 5922 5923 var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}} 5924 5925 ccfg, apiErr := s.checkStreamCfg(config, acc) 5926 if apiErr != nil { 5927 resp.Error = apiErr 5928 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 5929 return 5930 } 5931 cfg := &ccfg 5932 5933 // Now process the request and proposal. 5934 js.mu.Lock() 5935 defer js.mu.Unlock() 5936 5937 var self *streamAssignment 5938 var rg *raftGroup 5939 5940 // Capture if we have existing assignment first. 5941 if osa := js.streamAssignment(acc.Name, cfg.Name); osa != nil { 5942 if !reflect.DeepEqual(osa.Config, cfg) { 5943 resp.Error = NewJSStreamNameExistError() 5944 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 5945 return 5946 } 5947 // This is an equal assignment. 5948 self, rg = osa, osa.Group 5949 } 5950 5951 if cfg.Sealed { 5952 resp.Error = NewJSStreamInvalidConfigError(fmt.Errorf("stream configuration for create can not be sealed")) 5953 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 5954 return 5955 } 5956 5957 // Check for subject collisions here. 
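// Editor's note (illustrative, not part of the original source): an overlap
// means another stream in this account already covers an intersecting subject,
// e.g. an existing stream on "orders.>" collides with a new one on
// "orders.eu.*". The existing assignment (self), if any, is passed along so an
// idempotent create with an identical config is not flagged as colliding with
// itself.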
5958 if cc.subjectsOverlap(acc.Name, cfg.Subjects, self) { 5959 resp.Error = NewJSStreamSubjectOverlapError() 5960 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 5961 return 5962 } 5963 5964 apiErr = js.jsClusteredStreamLimitsCheck(acc, cfg) 5965 // Check for stream limits here before proposing. These need to be tracked from meta layer, not jsa. 5966 if apiErr != nil { 5967 resp.Error = apiErr 5968 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 5969 return 5970 } 5971 5972 // Raft group selection and placement. 5973 if rg == nil { 5974 // Check inflight before proposing in case we have an existing inflight proposal. 5975 if cc.inflight == nil { 5976 cc.inflight = make(map[string]map[string]*raftGroup) 5977 } 5978 streams, ok := cc.inflight[acc.Name] 5979 if !ok { 5980 streams = make(map[string]*raftGroup) 5981 cc.inflight[acc.Name] = streams 5982 } else if existing, ok := streams[cfg.Name]; ok { 5983 // We have existing for same stream. Re-use same group. 5984 rg = existing 5985 } 5986 } 5987 // Create a new one here if needed. 5988 if rg == nil { 5989 nrg, err := js.createGroupForStream(ci, cfg) 5990 if err != nil { 5991 resp.Error = NewJSClusterNoPeersError(err) 5992 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 5993 return 5994 } 5995 rg = nrg 5996 // Pick a preferred leader. 5997 rg.setPreferred() 5998 } 5999 6000 // Sync subject for post snapshot sync. 6001 sa := &streamAssignment{Group: rg, Sync: syncSubjForStream(), Config: cfg, Subject: subject, Reply: reply, Client: ci, Created: time.Now().UTC()} 6002 if err := cc.meta.Propose(encodeAddStreamAssignment(sa)); err == nil { 6003 // On success, add this as an inflight proposal so we can apply limits 6004 // on concurrent create requests while this stream assignment has 6005 // possibly not been processed yet. 6006 if streams, ok := cc.inflight[acc.Name]; ok { 6007 streams[cfg.Name] = rg 6008 } 6009 } 6010 } 6011 6012 var ( 6013 errReqTimeout = errors.New("timeout while waiting for response") 6014 errReqSrvExit = errors.New("server shutdown while waiting for response") 6015 ) 6016 6017 // blocking utility call to perform requests on the system account 6018 // returns (synchronized) v or error 6019 func sysRequest[T any](s *Server, subjFormat string, args ...interface{}) (*T, error) { 6020 isubj := fmt.Sprintf(subjFormat, args...) 
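// Editor's note (illustrative, not part of the original source): callers use
// this generic helper to fetch cluster state over the system account, e.g.
//
//	si, err := sysRequest[StreamInfo](s, clusterStreamInfoT, ci.serviceAccount(), cfg.Name)
//
// The reply handler registered below unmarshals the response into T and hands
// it back on a buffered channel; the call gives up after the 2 second timer
// fires and always removes its reply inbox on exit.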
6021 6022 s.mu.Lock() 6023 inbox := s.newRespInbox() 6024 results := make(chan *T, 1) 6025 s.sys.replies[inbox] = func(_ *subscription, _ *client, _ *Account, _, _ string, msg []byte) { 6026 var v T 6027 if err := json.Unmarshal(msg, &v); err != nil { 6028 s.Warnf("Error unmarshalling response for request '%s':%v", isubj, err) 6029 return 6030 } 6031 select { 6032 case results <- &v: 6033 default: 6034 s.Warnf("Failed placing request response on internal channel") 6035 } 6036 } 6037 s.mu.Unlock() 6038 6039 s.sendInternalMsgLocked(isubj, inbox, nil, nil) 6040 6041 defer func() { 6042 s.mu.Lock() 6043 defer s.mu.Unlock() 6044 if s.sys != nil && s.sys.replies != nil { 6045 delete(s.sys.replies, inbox) 6046 } 6047 }() 6048 6049 ttl := time.NewTimer(2 * time.Second) 6050 defer ttl.Stop() 6051 6052 select { 6053 case <-s.quitCh: 6054 return nil, errReqSrvExit 6055 case <-ttl.C: 6056 return nil, errReqTimeout 6057 case data := <-results: 6058 return data, nil 6059 } 6060 } 6061 6062 func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, cfg *StreamConfig, peerSet []string) { 6063 js, cc := s.getJetStreamCluster() 6064 if js == nil || cc == nil { 6065 return 6066 } 6067 6068 // Now process the request and proposal. 6069 js.mu.Lock() 6070 defer js.mu.Unlock() 6071 meta := cc.meta 6072 if meta == nil { 6073 return 6074 } 6075 6076 var resp = JSApiStreamUpdateResponse{ApiResponse: ApiResponse{Type: JSApiStreamUpdateResponseType}} 6077 6078 osa := js.streamAssignment(acc.Name, cfg.Name) 6079 6080 if osa == nil { 6081 resp.Error = NewJSStreamNotFoundError() 6082 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6083 return 6084 } 6085 var newCfg *StreamConfig 6086 if jsa := js.accounts[acc.Name]; jsa != nil { 6087 js.mu.Unlock() 6088 ncfg, err := jsa.configUpdateCheck(osa.Config, cfg, s) 6089 js.mu.Lock() 6090 if err != nil { 6091 resp.Error = NewJSStreamUpdateError(err, Unless(err)) 6092 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6093 return 6094 } else { 6095 newCfg = ncfg 6096 } 6097 } else { 6098 resp.Error = NewJSNotEnabledForAccountError() 6099 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6100 return 6101 } 6102 // Check for mirror changes which are not allowed. 6103 if !reflect.DeepEqual(newCfg.Mirror, osa.Config.Mirror) { 6104 resp.Error = NewJSStreamMirrorNotUpdatableError() 6105 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6106 return 6107 } 6108 6109 // Check for subject collisions here. 6110 if cc.subjectsOverlap(acc.Name, cfg.Subjects, osa) { 6111 resp.Error = NewJSStreamSubjectOverlapError() 6112 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6113 return 6114 } 6115 6116 // Make copy so to not change original. 6117 rg := osa.copyGroup().Group 6118 6119 // Check for a move request. 
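// Editor's note (illustrative, not part of the original source): a non-empty
// peerSet marks an explicit move request. It only counts as a cancellation
// when it has exactly Replicas entries and those entries match the front of
// the current peer set, i.e. the caller is asking to settle back onto the
// original peers rather than onto a new placement.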
6120 var isMoveRequest, isMoveCancel bool 6121 if lPeerSet := len(peerSet); lPeerSet > 0 { 6122 isMoveRequest = true 6123 // check if this is a cancellation 6124 if lPeerSet == osa.Config.Replicas && lPeerSet <= len(rg.Peers) { 6125 isMoveCancel = true 6126 // can only be a cancellation if the peer sets overlap as expected 6127 for i := 0; i < lPeerSet; i++ { 6128 if peerSet[i] != rg.Peers[i] { 6129 isMoveCancel = false 6130 break 6131 } 6132 } 6133 } 6134 } else { 6135 isMoveRequest = newCfg.Placement != nil && !reflect.DeepEqual(osa.Config.Placement, newCfg.Placement) 6136 } 6137 6138 // Check for replica changes. 6139 isReplicaChange := newCfg.Replicas != osa.Config.Replicas 6140 6141 // We stage consumer updates and do them after the stream update. 6142 var consumers []*consumerAssignment 6143 6144 // Check if this is a move request, but no cancellation, and we are already moving this stream. 6145 if isMoveRequest && !isMoveCancel && osa.Config.Replicas != len(rg.Peers) { 6146 // obtain stats to include in error message 6147 msg := _EMPTY_ 6148 if s.allPeersOffline(rg) { 6149 msg = fmt.Sprintf("all %d peers offline", len(rg.Peers)) 6150 } else { 6151 // Need to release js lock. 6152 js.mu.Unlock() 6153 if si, err := sysRequest[StreamInfo](s, clusterStreamInfoT, ci.serviceAccount(), cfg.Name); err != nil { 6154 msg = fmt.Sprintf("error retrieving info: %s", err.Error()) 6155 } else if si != nil { 6156 currentCount := 0 6157 if si.Cluster.Leader != _EMPTY_ { 6158 currentCount++ 6159 } 6160 combinedLag := uint64(0) 6161 for _, r := range si.Cluster.Replicas { 6162 if r.Current { 6163 currentCount++ 6164 } 6165 combinedLag += r.Lag 6166 } 6167 msg = fmt.Sprintf("total peers: %d, current peers: %d, combined lag: %d", 6168 len(rg.Peers), currentCount, combinedLag) 6169 } 6170 // Re-acquire here. 6171 js.mu.Lock() 6172 } 6173 resp.Error = NewJSStreamMoveInProgressError(msg) 6174 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6175 return 6176 } 6177 6178 // Can not move and scale at same time. 6179 if isMoveRequest && isReplicaChange { 6180 resp.Error = NewJSStreamMoveAndScaleError() 6181 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6182 return 6183 } 6184 6185 if isReplicaChange { 6186 // We are adding new peers here. 6187 if newCfg.Replicas > len(rg.Peers) { 6188 // Check that we have the allocation available. 6189 if err := js.jsClusteredStreamLimitsCheck(acc, newCfg); err != nil { 6190 resp.Error = err 6191 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6192 return 6193 } 6194 // Check if we do not have a cluster assigned, and if we do not make sure we 6195 // try to pick one. This could happen with older streams that were assigned by 6196 // previous servers. 6197 if rg.Cluster == _EMPTY_ { 6198 // Prefer placement directrives if we have them. 6199 if newCfg.Placement != nil && newCfg.Placement.Cluster != _EMPTY_ { 6200 rg.Cluster = newCfg.Placement.Cluster 6201 } else { 6202 // Fall back to the cluster assignment from the client. 6203 rg.Cluster = ci.Cluster 6204 } 6205 } 6206 peers, err := cc.selectPeerGroup(newCfg.Replicas, rg.Cluster, newCfg, rg.Peers, 0, nil) 6207 if err != nil { 6208 resp.Error = NewJSClusterNoPeersError(err) 6209 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6210 return 6211 } 6212 // Single nodes are not recorded by the NRG layer so we can rename. 
6213 if len(peers) == 1 { 6214 rg.Name = groupNameForStream(peers, rg.Storage) 6215 } else if len(rg.Peers) == 1 { 6216 // This is scale up from being a singelton, set preferred to that singelton. 6217 rg.Preferred = rg.Peers[0] 6218 } 6219 rg.Peers = peers 6220 } else { 6221 // We are deleting nodes here. We want to do our best to preserve the current leader. 6222 // We have support now from above that guarantees we are in our own Go routine, so can 6223 // ask for stream info from the stream leader to make sure we keep the leader in the new list. 6224 var curLeader string 6225 if !s.allPeersOffline(rg) { 6226 // Need to release js lock. 6227 js.mu.Unlock() 6228 if si, err := sysRequest[StreamInfo](s, clusterStreamInfoT, ci.serviceAccount(), cfg.Name); err != nil { 6229 s.Warnf("Did not receive stream info results for '%s > %s' due to: %s", acc, cfg.Name, err) 6230 } else if si != nil { 6231 if cl := si.Cluster; cl != nil && cl.Leader != _EMPTY_ { 6232 curLeader = getHash(cl.Leader) 6233 } 6234 } 6235 // Re-acquire here. 6236 js.mu.Lock() 6237 } 6238 // If we identified a leader make sure its part of the new group. 6239 selected := make([]string, 0, newCfg.Replicas) 6240 6241 if curLeader != _EMPTY_ { 6242 selected = append(selected, curLeader) 6243 } 6244 for _, peer := range rg.Peers { 6245 if len(selected) == newCfg.Replicas { 6246 break 6247 } 6248 if peer == curLeader { 6249 continue 6250 } 6251 if si, ok := s.nodeToInfo.Load(peer); ok && si != nil { 6252 if si.(nodeInfo).offline { 6253 continue 6254 } 6255 selected = append(selected, peer) 6256 } 6257 } 6258 rg.Peers = selected 6259 } 6260 6261 // Need to remap any consumers. 6262 for _, ca := range osa.consumers { 6263 // Ephemerals are R=1, so only auto-remap durables, or R>1, unless stream is interest or workqueue policy. 6264 numPeers := len(ca.Group.Peers) 6265 if ca.Config.Durable != _EMPTY_ || numPeers > 1 || cfg.Retention != LimitsPolicy { 6266 cca := ca.copyGroup() 6267 // Adjust preferred as needed. 6268 if numPeers == 1 && len(rg.Peers) > 1 { 6269 cca.Group.Preferred = ca.Group.Peers[0] 6270 } else { 6271 cca.Group.Preferred = _EMPTY_ 6272 } 6273 // Assign new peers. 6274 cca.Group.Peers = rg.Peers 6275 // We can not propose here before the stream itself so we collect them. 6276 consumers = append(consumers, cca) 6277 } 6278 } 6279 } else if isMoveRequest { 6280 if len(peerSet) == 0 { 6281 nrg, err := js.createGroupForStream(ci, newCfg) 6282 if err != nil { 6283 resp.Error = NewJSClusterNoPeersError(err) 6284 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6285 return 6286 } 6287 // filter peers present in both sets 6288 for _, peer := range rg.Peers { 6289 found := false 6290 for _, newPeer := range nrg.Peers { 6291 if peer == newPeer { 6292 found = true 6293 break 6294 } 6295 } 6296 if !found { 6297 peerSet = append(peerSet, peer) 6298 } 6299 } 6300 peerSet = append(peerSet, nrg.Peers...) 
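// Editor's note (illustrative worked example, not part of the original
// source): the transitional set is (old peers not in the new placement)
// followed by the new placement. Moving an R3 stream from {A, B, C} to a new
// group {B, D, E} yields peerSet = {A, C, B, D, E}; the stream carries this
// superset while the move is in flight, and the consumer re-placement below
// keys off the last Replicas entries of the set.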
6301 } 6302 if len(rg.Peers) == 1 { 6303 rg.Preferred = peerSet[0] 6304 } 6305 rg.Peers = peerSet 6306 6307 for _, ca := range osa.consumers { 6308 cca := ca.copyGroup() 6309 r := cca.Config.replicas(osa.Config) 6310 // shuffle part of cluster peer set we will be keeping 6311 randPeerSet := copyStrings(peerSet[len(peerSet)-newCfg.Replicas:]) 6312 rand.Shuffle(newCfg.Replicas, func(i, j int) { randPeerSet[i], randPeerSet[j] = randPeerSet[j], randPeerSet[i] }) 6313 // move overlapping peers at the end of randPeerSet and keep a tally of non overlapping peers 6314 dropPeerSet := make([]string, 0, len(cca.Group.Peers)) 6315 for _, p := range cca.Group.Peers { 6316 found := false 6317 for i, rp := range randPeerSet { 6318 if p == rp { 6319 randPeerSet[i] = randPeerSet[newCfg.Replicas-1] 6320 randPeerSet[newCfg.Replicas-1] = p 6321 found = true 6322 break 6323 } 6324 } 6325 if !found { 6326 dropPeerSet = append(dropPeerSet, p) 6327 } 6328 } 6329 cPeerSet := randPeerSet[newCfg.Replicas-r:] 6330 // In case of a set or cancel simply assign 6331 if len(peerSet) == newCfg.Replicas { 6332 cca.Group.Peers = cPeerSet 6333 } else { 6334 cca.Group.Peers = append(dropPeerSet, cPeerSet...) 6335 } 6336 // make sure it overlaps with peers and remove if not 6337 if cca.Group.Preferred != _EMPTY_ { 6338 found := false 6339 for _, p := range cca.Group.Peers { 6340 if p == cca.Group.Preferred { 6341 found = true 6342 break 6343 } 6344 } 6345 if !found { 6346 cca.Group.Preferred = _EMPTY_ 6347 } 6348 } 6349 // We can not propose here before the stream itself so we collect them. 6350 consumers = append(consumers, cca) 6351 } 6352 } else { 6353 // All other updates make sure no preferred is set. 6354 rg.Preferred = _EMPTY_ 6355 } 6356 6357 sa := &streamAssignment{Group: rg, Sync: osa.Sync, Created: osa.Created, Config: newCfg, Subject: subject, Reply: reply, Client: ci} 6358 meta.Propose(encodeUpdateStreamAssignment(sa)) 6359 6360 // Process any staged consumers. 6361 for _, ca := range consumers { 6362 meta.Propose(encodeAddConsumerAssignment(ca)) 6363 } 6364 } 6365 6366 func (s *Server) jsClusteredStreamDeleteRequest(ci *ClientInfo, acc *Account, stream, subject, reply string, rmsg []byte) { 6367 js, cc := s.getJetStreamCluster() 6368 if js == nil || cc == nil { 6369 return 6370 } 6371 6372 js.mu.Lock() 6373 defer js.mu.Unlock() 6374 6375 if cc.meta == nil { 6376 return 6377 } 6378 6379 osa := js.streamAssignment(acc.Name, stream) 6380 if osa == nil { 6381 var resp = JSApiStreamDeleteResponse{ApiResponse: ApiResponse{Type: JSApiStreamDeleteResponseType}} 6382 resp.Error = NewJSStreamNotFoundError() 6383 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6384 return 6385 } 6386 6387 sa := &streamAssignment{Group: osa.Group, Config: osa.Config, Subject: subject, Reply: reply, Client: ci} 6388 cc.meta.Propose(encodeDeleteStreamAssignment(sa)) 6389 } 6390 6391 // Process a clustered purge request. 
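// Editor's note (illustrative, not part of the original source): when the
// stream has a raft node the purge is proposed through that node and applied
// by every replica; only when there is no node (an unclustered/R1 stream) does
// the server purge the in-memory stream directly and answer the request
// itself.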
6392 func (s *Server) jsClusteredStreamPurgeRequest( 6393 ci *ClientInfo, 6394 acc *Account, 6395 mset *stream, 6396 stream, subject, reply string, 6397 rmsg []byte, 6398 preq *JSApiStreamPurgeRequest, 6399 ) { 6400 js, cc := s.getJetStreamCluster() 6401 if js == nil || cc == nil { 6402 return 6403 } 6404 6405 js.mu.Lock() 6406 sa := js.streamAssignment(acc.Name, stream) 6407 if sa == nil { 6408 resp := JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}} 6409 resp.Error = NewJSStreamNotFoundError() 6410 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6411 js.mu.Unlock() 6412 return 6413 } 6414 6415 if n := sa.Group.node; n != nil { 6416 sp := &streamPurge{Stream: stream, LastSeq: mset.state().LastSeq, Subject: subject, Reply: reply, Client: ci, Request: preq} 6417 n.Propose(encodeStreamPurge(sp)) 6418 js.mu.Unlock() 6419 return 6420 } 6421 js.mu.Unlock() 6422 6423 if mset == nil { 6424 return 6425 } 6426 6427 var resp = JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}} 6428 purged, err := mset.purge(preq) 6429 if err != nil { 6430 resp.Error = NewJSStreamGeneralError(err, Unless(err)) 6431 } else { 6432 resp.Purged = purged 6433 resp.Success = true 6434 } 6435 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6436 } 6437 6438 func (s *Server) jsClusteredStreamRestoreRequest( 6439 ci *ClientInfo, 6440 acc *Account, 6441 req *JSApiStreamRestoreRequest, 6442 stream, subject, reply string, rmsg []byte) { 6443 6444 js, cc := s.getJetStreamCluster() 6445 if js == nil || cc == nil { 6446 return 6447 } 6448 6449 js.mu.Lock() 6450 defer js.mu.Unlock() 6451 6452 if cc.meta == nil { 6453 return 6454 } 6455 6456 cfg := &req.Config 6457 resp := JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}} 6458 6459 if err := js.jsClusteredStreamLimitsCheck(acc, cfg); err != nil { 6460 resp.Error = err 6461 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6462 return 6463 } 6464 6465 if sa := js.streamAssignment(ci.serviceAccount(), cfg.Name); sa != nil { 6466 resp.Error = NewJSStreamNameExistRestoreFailedError() 6467 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6468 return 6469 } 6470 6471 // Raft group selection and placement. 6472 rg, err := js.createGroupForStream(ci, cfg) 6473 if err != nil { 6474 resp.Error = NewJSClusterNoPeersError(err) 6475 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6476 return 6477 } 6478 // Pick a preferred leader. 6479 rg.setPreferred() 6480 sa := &streamAssignment{Group: rg, Sync: syncSubjForStream(), Config: cfg, Subject: subject, Reply: reply, Client: ci, Created: time.Now().UTC()} 6481 // Now add in our restore state and pre-select a peer to handle the actual receipt of the snapshot. 6482 sa.Restore = &req.State 6483 cc.meta.Propose(encodeAddStreamAssignment(sa)) 6484 } 6485 6486 // Determine if all peers for this group are offline. 6487 func (s *Server) allPeersOffline(rg *raftGroup) bool { 6488 if rg == nil { 6489 return false 6490 } 6491 // Check to see if this stream has any servers online to respond. 6492 for _, peer := range rg.Peers { 6493 if si, ok := s.nodeToInfo.Load(peer); ok && si != nil { 6494 if !si.(nodeInfo).offline { 6495 return false 6496 } 6497 } 6498 } 6499 return true 6500 } 6501 6502 // This will do a scatter and gather operation for all streams for this account. 
This is only called from metadata leader. 6503 // This will be running in a separate Go routine. 6504 func (s *Server) jsClusteredStreamListRequest(acc *Account, ci *ClientInfo, filter string, offset int, subject, reply string, rmsg []byte) { 6505 defer s.grWG.Done() 6506 6507 js, cc := s.getJetStreamCluster() 6508 if js == nil || cc == nil { 6509 return 6510 } 6511 6512 js.mu.RLock() 6513 6514 var streams []*streamAssignment 6515 for _, sa := range cc.streams[acc.Name] { 6516 if IsNatsErr(sa.err, JSClusterNotAssignedErr) { 6517 continue 6518 } 6519 6520 if filter != _EMPTY_ { 6521 // These could not have subjects auto-filled in since they are raw and unprocessed. 6522 if len(sa.Config.Subjects) == 0 { 6523 if SubjectsCollide(filter, sa.Config.Name) { 6524 streams = append(streams, sa) 6525 } 6526 } else { 6527 for _, subj := range sa.Config.Subjects { 6528 if SubjectsCollide(filter, subj) { 6529 streams = append(streams, sa) 6530 break 6531 } 6532 } 6533 } 6534 } else { 6535 streams = append(streams, sa) 6536 } 6537 } 6538 6539 // Needs to be sorted for offsets etc. 6540 if len(streams) > 1 { 6541 sort.Slice(streams, func(i, j int) bool { 6542 return strings.Compare(streams[i].Config.Name, streams[j].Config.Name) < 0 6543 }) 6544 } 6545 6546 scnt := len(streams) 6547 if offset > scnt { 6548 offset = scnt 6549 } 6550 if offset > 0 { 6551 streams = streams[offset:] 6552 } 6553 if len(streams) > JSApiListLimit { 6554 streams = streams[:JSApiListLimit] 6555 } 6556 6557 var resp = JSApiStreamListResponse{ 6558 ApiResponse: ApiResponse{Type: JSApiStreamListResponseType}, 6559 Streams: make([]*StreamInfo, 0, len(streams)), 6560 } 6561 6562 js.mu.RUnlock() 6563 6564 if len(streams) == 0 { 6565 resp.Limit = JSApiListLimit 6566 resp.Offset = offset 6567 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6568 return 6569 } 6570 6571 // Create an inbox for our responses and send out our requests. 6572 s.mu.Lock() 6573 inbox := s.newRespInbox() 6574 rc := make(chan *StreamInfo, len(streams)) 6575 6576 // Store our handler. 6577 s.sys.replies[inbox] = func(sub *subscription, _ *client, _ *Account, subject, _ string, msg []byte) { 6578 var si StreamInfo 6579 if err := json.Unmarshal(msg, &si); err != nil { 6580 s.Warnf("Error unmarshalling clustered stream info response:%v", err) 6581 return 6582 } 6583 select { 6584 case rc <- &si: 6585 default: 6586 s.Warnf("Failed placing remote stream info result on internal channel") 6587 } 6588 } 6589 s.mu.Unlock() 6590 6591 // Cleanup after. 6592 defer func() { 6593 s.mu.Lock() 6594 if s.sys != nil && s.sys.replies != nil { 6595 delete(s.sys.replies, inbox) 6596 } 6597 s.mu.Unlock() 6598 }() 6599 6600 var missingNames []string 6601 sent := map[string]int{} 6602 6603 // Send out our requests here. 6604 js.mu.RLock() 6605 for _, sa := range streams { 6606 if s.allPeersOffline(sa.Group) { 6607 // Place offline onto our results by hand here. 6608 si := &StreamInfo{ 6609 Config: *sa.Config, 6610 Created: sa.Created, 6611 Cluster: js.offlineClusterInfo(sa.Group), 6612 TimeStamp: time.Now().UTC(), 6613 } 6614 resp.Streams = append(resp.Streams, si) 6615 missingNames = append(missingNames, sa.Config.Name) 6616 } else { 6617 isubj := fmt.Sprintf(clusterStreamInfoT, sa.Client.serviceAccount(), sa.Config.Name) 6618 s.sendInternalMsgLocked(isubj, inbox, nil, nil) 6619 sent[sa.Config.Name] = len(sa.consumers) 6620 } 6621 } 6622 // Don't hold lock. 
6623 js.mu.RUnlock() 6624 6625 const timeout = 4 * time.Second 6626 notActive := time.NewTimer(timeout) 6627 defer notActive.Stop() 6628 6629 LOOP: 6630 for len(sent) > 0 { 6631 select { 6632 case <-s.quitCh: 6633 return 6634 case <-notActive.C: 6635 s.Warnf("Did not receive all stream info results for %q", acc) 6636 for sName := range sent { 6637 missingNames = append(missingNames, sName) 6638 } 6639 break LOOP 6640 case si := <-rc: 6641 consCount := sent[si.Config.Name] 6642 if consCount > 0 { 6643 si.State.Consumers = consCount 6644 } 6645 delete(sent, si.Config.Name) 6646 resp.Streams = append(resp.Streams, si) 6647 // Check to see if we are done. 6648 if len(resp.Streams) == len(streams) { 6649 break LOOP 6650 } 6651 } 6652 } 6653 6654 // Needs to be sorted as well. 6655 if len(resp.Streams) > 1 { 6656 sort.Slice(resp.Streams, func(i, j int) bool { 6657 return strings.Compare(resp.Streams[i].Config.Name, resp.Streams[j].Config.Name) < 0 6658 }) 6659 } 6660 6661 resp.Total = scnt 6662 resp.Limit = JSApiListLimit 6663 resp.Offset = offset 6664 resp.Missing = missingNames 6665 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6666 } 6667 6668 // This will do a scatter and gather operation for all consumers for this stream and account. 6669 // This will be running in a separate Go routine. 6670 func (s *Server) jsClusteredConsumerListRequest(acc *Account, ci *ClientInfo, offset int, stream, subject, reply string, rmsg []byte) { 6671 defer s.grWG.Done() 6672 6673 js, cc := s.getJetStreamCluster() 6674 if js == nil || cc == nil { 6675 return 6676 } 6677 6678 js.mu.RLock() 6679 6680 var consumers []*consumerAssignment 6681 if sas := cc.streams[acc.Name]; sas != nil { 6682 if sa := sas[stream]; sa != nil { 6683 // Copy over since we need to sort etc. 6684 for _, ca := range sa.consumers { 6685 consumers = append(consumers, ca) 6686 } 6687 } 6688 } 6689 // Needs to be sorted. 6690 if len(consumers) > 1 { 6691 sort.Slice(consumers, func(i, j int) bool { 6692 return strings.Compare(consumers[i].Name, consumers[j].Name) < 0 6693 }) 6694 } 6695 6696 ocnt := len(consumers) 6697 if offset > ocnt { 6698 offset = ocnt 6699 } 6700 if offset > 0 { 6701 consumers = consumers[offset:] 6702 } 6703 if len(consumers) > JSApiListLimit { 6704 consumers = consumers[:JSApiListLimit] 6705 } 6706 6707 // Send out our requests here. 6708 var resp = JSApiConsumerListResponse{ 6709 ApiResponse: ApiResponse{Type: JSApiConsumerListResponseType}, 6710 Consumers: []*ConsumerInfo{}, 6711 } 6712 6713 js.mu.RUnlock() 6714 6715 if len(consumers) == 0 { 6716 resp.Limit = JSApiListLimit 6717 resp.Offset = offset 6718 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6719 return 6720 } 6721 6722 // Create an inbox for our responses and send out requests. 6723 s.mu.Lock() 6724 inbox := s.newRespInbox() 6725 rc := make(chan *ConsumerInfo, len(consumers)) 6726 6727 // Store our handler. 6728 s.sys.replies[inbox] = func(sub *subscription, _ *client, _ *Account, subject, _ string, msg []byte) { 6729 var ci ConsumerInfo 6730 if err := json.Unmarshal(msg, &ci); err != nil { 6731 s.Warnf("Error unmarshaling clustered consumer info response:%v", err) 6732 return 6733 } 6734 select { 6735 case rc <- &ci: 6736 default: 6737 s.Warnf("Failed placing consumer info result on internal chan") 6738 } 6739 } 6740 s.mu.Unlock() 6741 6742 // Cleanup after. 
6743 defer func() { 6744 s.mu.Lock() 6745 if s.sys != nil && s.sys.replies != nil { 6746 delete(s.sys.replies, inbox) 6747 } 6748 s.mu.Unlock() 6749 }() 6750 6751 var missingNames []string 6752 sent := map[string]struct{}{} 6753 6754 // Send out our requests here. 6755 js.mu.RLock() 6756 for _, ca := range consumers { 6757 if s.allPeersOffline(ca.Group) { 6758 // Place offline onto our results by hand here. 6759 ci := &ConsumerInfo{ 6760 Config: ca.Config, 6761 Created: ca.Created, 6762 Cluster: js.offlineClusterInfo(ca.Group), 6763 TimeStamp: time.Now().UTC(), 6764 } 6765 resp.Consumers = append(resp.Consumers, ci) 6766 missingNames = append(missingNames, ca.Name) 6767 } else { 6768 isubj := fmt.Sprintf(clusterConsumerInfoT, ca.Client.serviceAccount(), stream, ca.Name) 6769 s.sendInternalMsgLocked(isubj, inbox, nil, nil) 6770 sent[ca.Name] = struct{}{} 6771 } 6772 } 6773 // Don't hold lock. 6774 js.mu.RUnlock() 6775 6776 const timeout = 4 * time.Second 6777 notActive := time.NewTimer(timeout) 6778 defer notActive.Stop() 6779 6780 LOOP: 6781 for len(sent) > 0 { 6782 select { 6783 case <-s.quitCh: 6784 return 6785 case <-notActive.C: 6786 s.Warnf("Did not receive all consumer info results for '%s > %s'", acc, stream) 6787 for cName := range sent { 6788 missingNames = append(missingNames, cName) 6789 } 6790 break LOOP 6791 case ci := <-rc: 6792 delete(sent, ci.Name) 6793 resp.Consumers = append(resp.Consumers, ci) 6794 // Check to see if we are done. 6795 if len(resp.Consumers) == len(consumers) { 6796 break LOOP 6797 } 6798 } 6799 } 6800 6801 // Needs to be sorted as well. 6802 if len(resp.Consumers) > 1 { 6803 sort.Slice(resp.Consumers, func(i, j int) bool { 6804 return strings.Compare(resp.Consumers[i].Name, resp.Consumers[j].Name) < 0 6805 }) 6806 } 6807 6808 resp.Total = ocnt 6809 resp.Limit = JSApiListLimit 6810 resp.Offset = offset 6811 resp.Missing = missingNames 6812 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6813 } 6814 6815 func encodeStreamPurge(sp *streamPurge) []byte { 6816 var bb bytes.Buffer 6817 bb.WriteByte(byte(purgeStreamOp)) 6818 json.NewEncoder(&bb).Encode(sp) 6819 return bb.Bytes() 6820 } 6821 6822 func decodeStreamPurge(buf []byte) (*streamPurge, error) { 6823 var sp streamPurge 6824 err := json.Unmarshal(buf, &sp) 6825 return &sp, err 6826 } 6827 6828 func (s *Server) jsClusteredConsumerDeleteRequest(ci *ClientInfo, acc *Account, stream, consumer, subject, reply string, rmsg []byte) { 6829 js, cc := s.getJetStreamCluster() 6830 if js == nil || cc == nil { 6831 return 6832 } 6833 6834 js.mu.Lock() 6835 defer js.mu.Unlock() 6836 6837 if cc.meta == nil { 6838 return 6839 } 6840 6841 var resp = JSApiConsumerDeleteResponse{ApiResponse: ApiResponse{Type: JSApiConsumerDeleteResponseType}} 6842 6843 sa := js.streamAssignment(acc.Name, stream) 6844 if sa == nil { 6845 resp.Error = NewJSStreamNotFoundError() 6846 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6847 return 6848 6849 } 6850 if sa.consumers == nil { 6851 resp.Error = NewJSConsumerNotFoundError() 6852 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6853 return 6854 } 6855 oca := sa.consumers[consumer] 6856 if oca == nil { 6857 resp.Error = NewJSConsumerNotFoundError() 6858 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6859 return 6860 } 6861 oca.deleted = true 6862 ca := &consumerAssignment{Group: oca.Group, Stream: stream, Name: consumer, Config: oca.Config, Subject: 
subject, Reply: reply, Client: ci} 6863 cc.meta.Propose(encodeDeleteConsumerAssignment(ca)) 6864 } 6865 6866 func encodeMsgDelete(md *streamMsgDelete) []byte { 6867 var bb bytes.Buffer 6868 bb.WriteByte(byte(deleteMsgOp)) 6869 json.NewEncoder(&bb).Encode(md) 6870 return bb.Bytes() 6871 } 6872 6873 func decodeMsgDelete(buf []byte) (*streamMsgDelete, error) { 6874 var md streamMsgDelete 6875 err := json.Unmarshal(buf, &md) 6876 return &md, err 6877 } 6878 6879 func (s *Server) jsClusteredMsgDeleteRequest(ci *ClientInfo, acc *Account, mset *stream, stream, subject, reply string, req *JSApiMsgDeleteRequest, rmsg []byte) { 6880 js, cc := s.getJetStreamCluster() 6881 if js == nil || cc == nil { 6882 return 6883 } 6884 6885 js.mu.Lock() 6886 sa := js.streamAssignment(acc.Name, stream) 6887 if sa == nil { 6888 s.Debugf("Message delete failed, could not locate stream '%s > %s'", acc.Name, stream) 6889 js.mu.Unlock() 6890 return 6891 } 6892 6893 // Check for single replica items. 6894 if n := sa.Group.node; n != nil { 6895 md := streamMsgDelete{Seq: req.Seq, NoErase: req.NoErase, Stream: stream, Subject: subject, Reply: reply, Client: ci} 6896 n.Propose(encodeMsgDelete(&md)) 6897 js.mu.Unlock() 6898 return 6899 } 6900 js.mu.Unlock() 6901 6902 if mset == nil { 6903 return 6904 } 6905 6906 var err error 6907 var removed bool 6908 if req.NoErase { 6909 removed, err = mset.removeMsg(req.Seq) 6910 } else { 6911 removed, err = mset.eraseMsg(req.Seq) 6912 } 6913 var resp = JSApiMsgDeleteResponse{ApiResponse: ApiResponse{Type: JSApiMsgDeleteResponseType}} 6914 if err != nil { 6915 resp.Error = NewJSStreamMsgDeleteFailedError(err, Unless(err)) 6916 } else if !removed { 6917 resp.Error = NewJSSequenceNotFoundError(req.Seq) 6918 } else { 6919 resp.Success = true 6920 } 6921 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6922 } 6923 6924 func encodeAddStreamAssignment(sa *streamAssignment) []byte { 6925 var bb bytes.Buffer 6926 bb.WriteByte(byte(assignStreamOp)) 6927 json.NewEncoder(&bb).Encode(sa) 6928 return bb.Bytes() 6929 } 6930 6931 func encodeUpdateStreamAssignment(sa *streamAssignment) []byte { 6932 var bb bytes.Buffer 6933 bb.WriteByte(byte(updateStreamOp)) 6934 json.NewEncoder(&bb).Encode(sa) 6935 return bb.Bytes() 6936 } 6937 6938 func encodeDeleteStreamAssignment(sa *streamAssignment) []byte { 6939 var bb bytes.Buffer 6940 bb.WriteByte(byte(removeStreamOp)) 6941 json.NewEncoder(&bb).Encode(sa) 6942 return bb.Bytes() 6943 } 6944 6945 func decodeStreamAssignment(buf []byte) (*streamAssignment, error) { 6946 var sa streamAssignment 6947 err := json.Unmarshal(buf, &sa) 6948 if err != nil { 6949 return nil, err 6950 } 6951 fixCfgMirrorWithDedupWindow(sa.Config) 6952 return &sa, err 6953 } 6954 6955 func encodeDeleteRange(dr *DeleteRange) []byte { 6956 var bb bytes.Buffer 6957 bb.WriteByte(byte(deleteRangeOp)) 6958 json.NewEncoder(&bb).Encode(dr) 6959 return bb.Bytes() 6960 } 6961 6962 func decodeDeleteRange(buf []byte) (*DeleteRange, error) { 6963 var dr DeleteRange 6964 err := json.Unmarshal(buf, &dr) 6965 if err != nil { 6966 return nil, err 6967 } 6968 return &dr, err 6969 } 6970 6971 // createGroupForConsumer will create a new group from same peer set as the stream. 
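// Editor's note (illustrative, not part of the original source): the consumer
// group is always carved out of the parent stream's peer set. For an R3 stream
// on {A, B, C} with a consumer requesting Replicas: 1, one active peer is
// picked at random from the shuffled active list; nil is returned when the
// requested replica count cannot be met or too few peers are online for
// quorum.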
6972 func (cc *jetStreamCluster) createGroupForConsumer(cfg *ConsumerConfig, sa *streamAssignment) *raftGroup { 6973 if len(sa.Group.Peers) == 0 || cfg.Replicas > len(sa.Group.Peers) { 6974 return nil 6975 } 6976 6977 peers := copyStrings(sa.Group.Peers) 6978 var _ss [5]string 6979 active := _ss[:0] 6980 6981 // Calculate all active peers. 6982 for _, peer := range peers { 6983 if sir, ok := cc.s.nodeToInfo.Load(peer); ok && sir != nil { 6984 if !sir.(nodeInfo).offline { 6985 active = append(active, peer) 6986 } 6987 } 6988 } 6989 if quorum := cfg.Replicas/2 + 1; quorum > len(active) { 6990 // Not enough active to satisfy the request. 6991 return nil 6992 } 6993 6994 // If we want less then our parent stream, select from active. 6995 if cfg.Replicas > 0 && cfg.Replicas < len(peers) { 6996 // Pedantic in case stream is say R5 and consumer is R3 and 3 or more offline, etc. 6997 if len(active) < cfg.Replicas { 6998 return nil 6999 } 7000 // First shuffle the active peers and then select to account for replica = 1. 7001 rand.Shuffle(len(active), func(i, j int) { active[i], active[j] = active[j], active[i] }) 7002 peers = active[:cfg.Replicas] 7003 } 7004 storage := sa.Config.Storage 7005 if cfg.MemoryStorage { 7006 storage = MemoryStorage 7007 } 7008 return &raftGroup{Name: groupNameForConsumer(peers, storage), Storage: storage, Peers: peers} 7009 } 7010 7011 // jsClusteredConsumerRequest is first point of entry to create a consumer with R > 1. 7012 func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, stream string, cfg *ConsumerConfig, action ConsumerAction) { 7013 js, cc := s.getJetStreamCluster() 7014 if js == nil || cc == nil { 7015 return 7016 } 7017 7018 var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} 7019 7020 streamCfg, ok := js.clusterStreamConfig(acc.Name, stream) 7021 if !ok { 7022 resp.Error = NewJSStreamNotFoundError() 7023 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7024 return 7025 } 7026 selectedLimits, _, _, apiErr := acc.selectLimits(&streamCfg) 7027 if apiErr != nil { 7028 resp.Error = apiErr 7029 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7030 return 7031 } 7032 srvLim := &s.getOpts().JetStreamLimits 7033 // Make sure we have sane defaults 7034 setConsumerConfigDefaults(cfg, &streamCfg, srvLim, selectedLimits) 7035 7036 if err := checkConsumerCfg(cfg, srvLim, &streamCfg, acc, selectedLimits, false); err != nil { 7037 resp.Error = err 7038 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7039 return 7040 } 7041 7042 js.mu.Lock() 7043 defer js.mu.Unlock() 7044 7045 if cc.meta == nil { 7046 return 7047 } 7048 7049 // Lookup the stream assignment. 7050 sa := js.streamAssignment(acc.Name, stream) 7051 if sa == nil { 7052 resp.Error = NewJSStreamNotFoundError() 7053 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7054 return 7055 } 7056 7057 // Check for max consumers here to short circuit if possible. 7058 // Start with limit on a stream, but if one is defined at the level of the account 7059 // and is lower, use that limit. 7060 maxc := sa.Config.MaxConsumers 7061 if maxc <= 0 || (selectedLimits.MaxConsumers > 0 && selectedLimits.MaxConsumers < maxc) { 7062 maxc = selectedLimits.MaxConsumers 7063 } 7064 if maxc > 0 { 7065 // Don't count DIRECTS. 
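// Editor's note (illustrative, not part of the original source): with a stream
// limit of MaxConsumers: 100 and an account limit of 50, maxc resolves to 50
// above. The count below only tallies consumers that are not marked Direct.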
7066 total := 0 7067 for _, ca := range sa.consumers { 7068 if ca.Config != nil && !ca.Config.Direct { 7069 total++ 7070 } 7071 } 7072 if total >= maxc { 7073 resp.Error = NewJSMaximumConsumersLimitError() 7074 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7075 return 7076 } 7077 } 7078 7079 // Also short circuit if DeliverLastPerSubject is set with no FilterSubject. 7080 if cfg.DeliverPolicy == DeliverLastPerSubject { 7081 if cfg.FilterSubject == _EMPTY_ && len(cfg.FilterSubjects) == 0 { 7082 resp.Error = NewJSConsumerInvalidPolicyError(fmt.Errorf("consumer delivery policy is deliver last per subject, but FilterSubject is not set")) 7083 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7084 return 7085 } 7086 } 7087 7088 // Setup proper default for ack wait if we are in explicit ack mode. 7089 if cfg.AckWait == 0 && (cfg.AckPolicy == AckExplicit || cfg.AckPolicy == AckAll) { 7090 cfg.AckWait = JsAckWaitDefault 7091 } 7092 // Setup default of -1, meaning no limit for MaxDeliver. 7093 if cfg.MaxDeliver == 0 { 7094 cfg.MaxDeliver = -1 7095 } 7096 // Set proper default for max ack pending if we are ack explicit and none has been set. 7097 if cfg.AckPolicy == AckExplicit && cfg.MaxAckPending == 0 { 7098 cfg.MaxAckPending = JsDefaultMaxAckPending 7099 } 7100 7101 var ca *consumerAssignment 7102 var oname string 7103 7104 // See if we have an existing one already under same durable name or 7105 // if name was set by the user. 7106 if isDurableConsumer(cfg) || cfg.Name != _EMPTY_ { 7107 if cfg.Name != _EMPTY_ { 7108 oname = cfg.Name 7109 } else { 7110 oname = cfg.Durable 7111 } 7112 if ca = sa.consumers[oname]; ca != nil && !ca.deleted { 7113 if action == ActionCreate && !reflect.DeepEqual(cfg, ca.Config) { 7114 resp.Error = NewJSConsumerAlreadyExistsError() 7115 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7116 return 7117 } 7118 // Do quick sanity check on new cfg to prevent here if possible. 7119 if err := acc.checkNewConsumerConfig(ca.Config, cfg); err != nil { 7120 resp.Error = NewJSConsumerCreateError(err, Unless(err)) 7121 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7122 return 7123 } 7124 } 7125 } 7126 7127 // If this is new consumer. 7128 if ca == nil { 7129 if action == ActionUpdate { 7130 resp.Error = NewJSConsumerDoesNotExistError() 7131 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7132 return 7133 } 7134 rg := cc.createGroupForConsumer(cfg, sa) 7135 if rg == nil { 7136 resp.Error = NewJSInsufficientResourcesError() 7137 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7138 return 7139 } 7140 // Pick a preferred leader. 7141 rg.setPreferred() 7142 7143 // Inherit cluster from stream. 7144 rg.Cluster = sa.Group.Cluster 7145 7146 // We need to set the ephemeral here before replicating. 7147 if !isDurableConsumer(cfg) { 7148 // We chose to have ephemerals be R=1 unless stream is interest or workqueue. 7149 // Consumer can override. 7150 if sa.Config.Retention == LimitsPolicy && cfg.Replicas <= 1 { 7151 rg.Peers = []string{rg.Preferred} 7152 rg.Name = groupNameForConsumer(rg.Peers, rg.Storage) 7153 } 7154 if cfg.Name != _EMPTY_ { 7155 oname = cfg.Name 7156 } else { 7157 // Make sure name is unique. 
7158 for { 7159 oname = createConsumerName() 7160 if sa.consumers != nil { 7161 if sa.consumers[oname] != nil { 7162 continue 7163 } 7164 } 7165 break 7166 } 7167 } 7168 } 7169 if len(rg.Peers) > 1 { 7170 if maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets; maxHaAssets != 0 { 7171 for _, peer := range rg.Peers { 7172 if ni, ok := s.nodeToInfo.Load(peer); ok { 7173 ni := ni.(nodeInfo) 7174 if stats := ni.stats; stats != nil && stats.HAAssets > maxHaAssets { 7175 resp.Error = NewJSInsufficientResourcesError() 7176 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7177 s.Warnf("%s@%s (HA Asset Count: %d) exceeds max ha asset limit of %d"+ 7178 " for (durable) consumer %s placement on stream %s", 7179 ni.name, ni.cluster, ni.stats.HAAssets, maxHaAssets, oname, stream) 7180 return 7181 } 7182 } 7183 } 7184 } 7185 } 7186 ca = &consumerAssignment{ 7187 Group: rg, 7188 Stream: stream, 7189 Name: oname, 7190 Config: cfg, 7191 Subject: subject, 7192 Reply: reply, 7193 Client: ci, 7194 Created: time.Now().UTC(), 7195 } 7196 } else { 7197 // If the consumer already exists then don't allow updating the PauseUntil, just set 7198 // it back to whatever the current configured value is. 7199 cfg.PauseUntil = ca.Config.PauseUntil 7200 7201 nca := ca.copyGroup() 7202 7203 rBefore := nca.Config.replicas(sa.Config) 7204 rAfter := cfg.replicas(sa.Config) 7205 7206 var curLeader string 7207 if rBefore != rAfter { 7208 // We are modifying nodes here. We want to do our best to preserve the current leader. 7209 // We have support now from above that guarantees we are in our own Go routine, so can 7210 // ask for stream info from the stream leader to make sure we keep the leader in the new list. 7211 if !s.allPeersOffline(ca.Group) { 7212 // Need to release js lock. 7213 js.mu.Unlock() 7214 if ci, err := sysRequest[ConsumerInfo](s, clusterConsumerInfoT, ci.serviceAccount(), sa.Config.Name, cfg.Durable); err != nil { 7215 s.Warnf("Did not receive consumer info results for '%s > %s > %s' due to: %s", acc, sa.Config.Name, cfg.Durable, err) 7216 } else if ci != nil { 7217 if cl := ci.Cluster; cl != nil { 7218 curLeader = getHash(cl.Leader) 7219 } 7220 } 7221 // Re-acquire here. 7222 js.mu.Lock() 7223 } 7224 } 7225 7226 if rBefore < rAfter { 7227 newPeerSet := nca.Group.Peers 7228 // scale up by adding new members from the stream peer set that are not yet in the consumer peer set 7229 streamPeerSet := copyStrings(sa.Group.Peers) 7230 rand.Shuffle(rAfter, func(i, j int) { streamPeerSet[i], streamPeerSet[j] = streamPeerSet[j], streamPeerSet[i] }) 7231 for _, p := range streamPeerSet { 7232 found := false 7233 for _, sp := range newPeerSet { 7234 if sp == p { 7235 found = true 7236 break 7237 } 7238 } 7239 if !found { 7240 newPeerSet = append(newPeerSet, p) 7241 if len(newPeerSet) == rAfter { 7242 break 7243 } 7244 } 7245 } 7246 nca.Group.Peers = newPeerSet 7247 nca.Group.Preferred = curLeader 7248 } else if rBefore > rAfter { 7249 newPeerSet := nca.Group.Peers 7250 // mark leader preferred and move it to end 7251 nca.Group.Preferred = curLeader 7252 if nca.Group.Preferred != _EMPTY_ { 7253 for i, p := range newPeerSet { 7254 if nca.Group.Preferred == p { 7255 newPeerSet[i] = newPeerSet[len(newPeerSet)-1] 7256 newPeerSet[len(newPeerSet)-1] = p 7257 } 7258 } 7259 } 7260 // scale down by removing peers from the end 7261 newPeerSet = newPeerSet[len(newPeerSet)-rAfter:] 7262 nca.Group.Peers = newPeerSet 7263 } 7264 7265 // Update config and client info on copy of existing. 
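// Editor's note (illustrative worked example, not part of the original
// source): scaling a consumer on {A, B, C} down to R1 with its current leader
// on B first swaps B to the end ({A, C, B}) and then keeps the trailing rAfter
// entries, leaving {B}, so the leader survives the scale-down whenever it
// could be identified.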
7266 nca.Config = cfg 7267 nca.Client = ci 7268 nca.Subject = subject 7269 nca.Reply = reply 7270 ca = nca 7271 } 7272 7273 eca := encodeAddConsumerAssignment(ca) 7274 7275 // Mark this as pending. 7276 if sa.consumers == nil { 7277 sa.consumers = make(map[string]*consumerAssignment) 7278 } 7279 sa.consumers[ca.Name] = ca 7280 7281 // Do formal proposal. 7282 cc.meta.Propose(eca) 7283 } 7284 7285 func encodeAddConsumerAssignment(ca *consumerAssignment) []byte { 7286 var bb bytes.Buffer 7287 bb.WriteByte(byte(assignConsumerOp)) 7288 json.NewEncoder(&bb).Encode(ca) 7289 return bb.Bytes() 7290 } 7291 7292 func encodeDeleteConsumerAssignment(ca *consumerAssignment) []byte { 7293 var bb bytes.Buffer 7294 bb.WriteByte(byte(removeConsumerOp)) 7295 json.NewEncoder(&bb).Encode(ca) 7296 return bb.Bytes() 7297 } 7298 7299 func decodeConsumerAssignment(buf []byte) (*consumerAssignment, error) { 7300 var ca consumerAssignment 7301 err := json.Unmarshal(buf, &ca) 7302 return &ca, err 7303 } 7304 7305 func encodeAddConsumerAssignmentCompressed(ca *consumerAssignment) []byte { 7306 b, err := json.Marshal(ca) 7307 if err != nil { 7308 return nil 7309 } 7310 // TODO(dlc) - Streaming better approach here probably. 7311 var bb bytes.Buffer 7312 bb.WriteByte(byte(assignCompressedConsumerOp)) 7313 bb.Write(s2.Encode(nil, b)) 7314 return bb.Bytes() 7315 } 7316 7317 func decodeConsumerAssignmentCompressed(buf []byte) (*consumerAssignment, error) { 7318 var ca consumerAssignment 7319 js, err := s2.Decode(nil, buf) 7320 if err != nil { 7321 return nil, err 7322 } 7323 err = json.Unmarshal(js, &ca) 7324 return &ca, err 7325 } 7326 7327 var errBadStreamMsg = errors.New("jetstream cluster bad replicated stream msg") 7328 7329 func decodeStreamMsg(buf []byte) (subject, reply string, hdr, msg []byte, lseq uint64, ts int64, err error) { 7330 var le = binary.LittleEndian 7331 if len(buf) < 26 { 7332 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7333 } 7334 lseq = le.Uint64(buf) 7335 buf = buf[8:] 7336 ts = int64(le.Uint64(buf)) 7337 buf = buf[8:] 7338 sl := int(le.Uint16(buf)) 7339 buf = buf[2:] 7340 if len(buf) < sl { 7341 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7342 } 7343 subject = string(buf[:sl]) 7344 buf = buf[sl:] 7345 if len(buf) < 2 { 7346 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7347 } 7348 rl := int(le.Uint16(buf)) 7349 buf = buf[2:] 7350 if len(buf) < rl { 7351 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7352 } 7353 reply = string(buf[:rl]) 7354 buf = buf[rl:] 7355 if len(buf) < 2 { 7356 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7357 } 7358 hl := int(le.Uint16(buf)) 7359 buf = buf[2:] 7360 if len(buf) < hl { 7361 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7362 } 7363 if hdr = buf[:hl]; len(hdr) == 0 { 7364 hdr = nil 7365 } 7366 buf = buf[hl:] 7367 if len(buf) < 4 { 7368 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7369 } 7370 ml := int(le.Uint32(buf)) 7371 buf = buf[4:] 7372 if len(buf) < ml { 7373 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7374 } 7375 if msg = buf[:ml]; len(msg) == 0 { 7376 msg = nil 7377 } 7378 return subject, reply, hdr, msg, lseq, ts, nil 7379 } 7380 7381 // Helper to return if compression allowed. 
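// Editor's note (illustrative, not part of the original source): the
// replicated stream message payload decoded above, and produced by
// encodeStreamMsg below, is a little-endian, length-prefixed layout that
// follows the one-byte op code:
//
//	lseq    uint64
//	ts      int64 (written as uint64)
//	subject uint16 length + bytes
//	reply   uint16 length + bytes
//	header  uint16 length + bytes
//	message uint32 length + bytes
//
// which is why decodeStreamMsg requires at least 26 bytes (8+8+2+2+2+4) before
// reading any variable-length field.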
7382 func (mset *stream) compressAllowed() bool { 7383 mset.clMu.Lock() 7384 defer mset.clMu.Unlock() 7385 return mset.compressOK 7386 } 7387 7388 func encodeStreamMsg(subject, reply string, hdr, msg []byte, lseq uint64, ts int64) []byte { 7389 return encodeStreamMsgAllowCompress(subject, reply, hdr, msg, lseq, ts, false) 7390 } 7391 7392 // Threshold for compression. 7393 // TODO(dlc) - Eventually make configurable. 7394 const compressThreshold = 256 7395 7396 // If allowed and contents over the threshold we will compress. 7397 func encodeStreamMsgAllowCompress(subject, reply string, hdr, msg []byte, lseq uint64, ts int64, compressOK bool) []byte { 7398 shouldCompress := compressOK && len(subject)+len(reply)+len(hdr)+len(msg) > compressThreshold 7399 7400 elen := 1 + 8 + 8 + len(subject) + len(reply) + len(hdr) + len(msg) 7401 elen += (2 + 2 + 2 + 4) // Encoded lengths, 4bytes 7402 // TODO(dlc) - check sizes of subject, reply and hdr, make sure uint16 ok. 7403 buf := make([]byte, elen) 7404 buf[0] = byte(streamMsgOp) 7405 var le = binary.LittleEndian 7406 wi := 1 7407 le.PutUint64(buf[wi:], lseq) 7408 wi += 8 7409 le.PutUint64(buf[wi:], uint64(ts)) 7410 wi += 8 7411 le.PutUint16(buf[wi:], uint16(len(subject))) 7412 wi += 2 7413 copy(buf[wi:], subject) 7414 wi += len(subject) 7415 le.PutUint16(buf[wi:], uint16(len(reply))) 7416 wi += 2 7417 copy(buf[wi:], reply) 7418 wi += len(reply) 7419 le.PutUint16(buf[wi:], uint16(len(hdr))) 7420 wi += 2 7421 if len(hdr) > 0 { 7422 copy(buf[wi:], hdr) 7423 wi += len(hdr) 7424 } 7425 le.PutUint32(buf[wi:], uint32(len(msg))) 7426 wi += 4 7427 if len(msg) > 0 { 7428 copy(buf[wi:], msg) 7429 wi += len(msg) 7430 } 7431 7432 // Check if we should compress. 7433 if shouldCompress { 7434 nbuf := make([]byte, s2.MaxEncodedLen(elen)) 7435 nbuf[0] = byte(compressedStreamMsgOp) 7436 ebuf := s2.Encode(nbuf[1:], buf[1:wi]) 7437 // Only pay cost of decode the other side if we compressed. 7438 // S2 will allow us to try without major penalty for non-compressable data. 7439 if len(ebuf) < wi { 7440 nbuf = nbuf[:len(ebuf)+1] 7441 buf, wi = nbuf, len(nbuf) 7442 } 7443 } 7444 7445 return buf[:wi] 7446 } 7447 7448 // Determine if all peers in our set support the binary snapshot. 7449 func (mset *stream) supportsBinarySnapshot() bool { 7450 mset.mu.RLock() 7451 defer mset.mu.RUnlock() 7452 return mset.supportsBinarySnapshotLocked() 7453 } 7454 7455 // Determine if all peers in our set support the binary snapshot. 7456 // Lock should be held. 7457 func (mset *stream) supportsBinarySnapshotLocked() bool { 7458 s, n := mset.srv, mset.node 7459 if s == nil || n == nil { 7460 return false 7461 } 7462 // Grab our peers and walk them to make sure we can all support binary stream snapshots. 7463 id, peers := n.ID(), n.Peers() 7464 for _, p := range peers { 7465 if p.ID == id { 7466 // We know we support ourselves. 7467 continue 7468 } 7469 if sir, ok := s.nodeToInfo.Load(p.ID); !ok || sir == nil || !sir.(nodeInfo).binarySnapshots { 7470 return false 7471 } 7472 } 7473 return true 7474 } 7475 7476 // StreamSnapshot is used for snapshotting and out of band catch up in clustered mode. 7477 // Legacy, replace with binary stream snapshots. 7478 type streamSnapshot struct { 7479 Msgs uint64 `json:"messages"` 7480 Bytes uint64 `json:"bytes"` 7481 FirstSeq uint64 `json:"first_seq"` 7482 LastSeq uint64 `json:"last_seq"` 7483 Failed uint64 `json:"clfs"` 7484 Deleted []uint64 `json:"deleted,omitempty"` 7485 } 7486 7487 // Grab a snapshot of a stream for clustered mode. 
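// Editor's note (illustrative, not part of the original source): the snapshot
// format is negotiated per peer set. When every peer advertises binary
// snapshot support the store's encoded stream state is used directly;
// otherwise the legacy streamSnapshot JSON above, with Deleted kept as a
// sorted []uint64, is marshalled, presumably so that older servers can still
// apply it.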
7488 func (mset *stream) stateSnapshot() []byte { 7489 mset.mu.RLock() 7490 defer mset.mu.RUnlock() 7491 return mset.stateSnapshotLocked() 7492 } 7493 7494 // Grab a snapshot of a stream for clustered mode. 7495 // Lock should be held. 7496 func (mset *stream) stateSnapshotLocked() []byte { 7497 // Decide if we can support the new style of stream snapshots. 7498 if mset.supportsBinarySnapshotLocked() { 7499 snap, _ := mset.store.EncodedStreamState(mset.getCLFS()) 7500 return snap 7501 } 7502 7503 // Older v1 version with deleted as a sorted []uint64. 7504 state := mset.store.State() 7505 snap := &streamSnapshot{ 7506 Msgs: state.Msgs, 7507 Bytes: state.Bytes, 7508 FirstSeq: state.FirstSeq, 7509 LastSeq: state.LastSeq, 7510 Failed: mset.getCLFS(), 7511 Deleted: state.Deleted, 7512 } 7513 b, _ := json.Marshal(snap) 7514 return b 7515 } 7516 7517 // Will check if we can do message compression in RAFT and catchup logic. 7518 func (mset *stream) checkAllowMsgCompress(peers []string) { 7519 allowed := true 7520 for _, id := range peers { 7521 sir, ok := mset.srv.nodeToInfo.Load(id) 7522 if !ok || sir == nil { 7523 allowed = false 7524 break 7525 } 7526 // Check for capability. 7527 if si := sir.(nodeInfo); si.cfg == nil || !si.cfg.CompressOK { 7528 allowed = false 7529 break 7530 } 7531 } 7532 mset.mu.Lock() 7533 mset.compressOK = allowed 7534 mset.mu.Unlock() 7535 } 7536 7537 // To warn when we are getting too far behind from what has been proposed vs what has been committed. 7538 const streamLagWarnThreshold = 10_000 7539 7540 // processClusteredMsg will propose the inbound message to the underlying raft group. 7541 func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg []byte, mt *msgTrace) (retErr error) { 7542 // For possible error response. 7543 var response []byte 7544 7545 mset.mu.RLock() 7546 canRespond := !mset.cfg.NoAck && len(reply) > 0 7547 name, stype, store := mset.cfg.Name, mset.cfg.Storage, mset.store 7548 s, js, jsa, st, r, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.tier, mset.outq, mset.node 7549 maxMsgSize, lseq, clfs := int(mset.cfg.MaxMsgSize), mset.lseq, mset.clfs 7550 interestPolicy, discard, maxMsgs, maxBytes := mset.cfg.Retention != LimitsPolicy, mset.cfg.Discard, mset.cfg.MaxMsgs, mset.cfg.MaxBytes 7551 isLeader, isSealed := mset.isLeader(), mset.cfg.Sealed 7552 7553 // We need to track state to check limits if interest retention and discard new with max msgs or bytes. 7554 var state StreamState 7555 if interestPolicy && discard == DiscardNew && (maxMsgs > 0 || maxBytes > 0) { 7556 mset.store.FastState(&state) 7557 } 7558 mset.mu.RUnlock() 7559 7560 // This should not happen but possible now that we allow scale up, and scale down where this could trigger. 7561 // 7562 // We also invoke this in clustering mode for message tracing when not 7563 // performing message delivery. 7564 if node == nil || mt.traceOnly() { 7565 return mset.processJetStreamMsg(subject, reply, hdr, msg, 0, 0, mt) 7566 } 7567 7568 // If message tracing (with message delivery), we will need to send the 7569 // event on exit in case there was an error (if message was not proposed). 7570 // Otherwise, the event will be sent from processJetStreamMsg when 7571 // invoked by the leader (from applyStreamEntries). 7572 if mt != nil { 7573 defer func() { 7574 if retErr != nil { 7575 mt.sendEventFromJetStream(retErr) 7576 } 7577 }() 7578 } 7579 7580 // Check that we are the leader. 
This can be false if we have scaled up from an R1 that had inbound queued messages. 7581 if !isLeader { 7582 return NewJSClusterNotLeaderError() 7583 } 7584 7585 // Bail here if sealed. 7586 if isSealed { 7587 var resp = JSPubAckResponse{PubAck: &PubAck{Stream: mset.name()}, Error: NewJSStreamSealedError()} 7588 b, _ := json.Marshal(resp) 7589 mset.outq.sendMsg(reply, b) 7590 return NewJSStreamSealedError() 7591 } 7592 7593 // Check here pre-emptively if we have exceeded this server limits. 7594 if js.limitsExceeded(stype) { 7595 s.resourcesExceededError() 7596 if canRespond { 7597 b, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: NewJSInsufficientResourcesError()}) 7598 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, b, nil, 0)) 7599 } 7600 // Stepdown regardless. 7601 if node := mset.raftNode(); node != nil { 7602 node.StepDown() 7603 } 7604 return NewJSInsufficientResourcesError() 7605 } 7606 7607 // Check here pre-emptively if we have exceeded our account limits. 7608 if exceeded, err := jsa.wouldExceedLimits(st, tierName, r, subject, hdr, msg); exceeded { 7609 if err == nil { 7610 err = NewJSAccountResourcesExceededError() 7611 } 7612 s.RateLimitWarnf(err.Error()) 7613 if canRespond { 7614 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7615 resp.Error = err 7616 response, _ = json.Marshal(resp) 7617 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) 7618 } 7619 return err 7620 } 7621 7622 // Check msgSize if we have a limit set there. Again this works if it goes through but better to be pre-emptive. 7623 if maxMsgSize >= 0 && (len(hdr)+len(msg)) > maxMsgSize { 7624 err := fmt.Errorf("JetStream message size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name) 7625 s.RateLimitWarnf(err.Error()) 7626 if canRespond { 7627 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7628 resp.Error = NewJSStreamMessageExceedsMaximumError() 7629 response, _ = json.Marshal(resp) 7630 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) 7631 } 7632 return err 7633 } 7634 7635 // Some header checks can be checked pre proposal. Most can not. 7636 if len(hdr) > 0 { 7637 // Expected last sequence per subject. 7638 // We can check for last sequence per subject but only if the expected seq <= lseq. 7639 if seq, exists := getExpectedLastSeqPerSubject(hdr); exists && store != nil && seq > 0 && seq <= lseq { 7640 var smv StoreMsg 7641 var fseq uint64 7642 sm, err := store.LoadLastMsg(subject, &smv) 7643 if sm != nil { 7644 fseq = sm.seq 7645 } 7646 if err != nil || fseq != seq { 7647 if canRespond { 7648 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7649 resp.PubAck = &PubAck{Stream: name} 7650 resp.Error = NewJSStreamWrongLastSequenceError(fseq) 7651 b, _ := json.Marshal(resp) 7652 outq.sendMsg(reply, b) 7653 } 7654 return fmt.Errorf("last sequence by subject mismatch: %d vs %d", seq, fseq) 7655 } 7656 } 7657 // Expected stream name can also be pre-checked. 7658 if sname := getExpectedStream(hdr); sname != _EMPTY_ && sname != name { 7659 if canRespond { 7660 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7661 resp.PubAck = &PubAck{Stream: name} 7662 resp.Error = NewJSStreamNotMatchError() 7663 b, _ := json.Marshal(resp) 7664 outq.sendMsg(reply, b) 7665 } 7666 return errors.New("expected stream does not match") 7667 } 7668 } 7669 7670 // Since we encode header len as u16 make sure we do not exceed. 7671 // Again this works if it goes through but better to be pre-emptive. 
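// (math.MaxUint16 is 65535, matching the 2-byte header-length field written by encodeStreamMsgAllowCompress.)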
7672 if len(hdr) > math.MaxUint16 { 7673 err := fmt.Errorf("JetStream header size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name) 7674 s.RateLimitWarnf(err.Error()) 7675 if canRespond { 7676 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7677 resp.Error = NewJSStreamHeaderExceedsMaximumError() 7678 response, _ = json.Marshal(resp) 7679 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) 7680 } 7681 return err 7682 } 7683 7684 // Proceed with proposing this message. 7685 7686 // We only use mset.clseq for clustering and in case we run ahead of actual commits. 7687 // Check if we need to set initial value here 7688 mset.clMu.Lock() 7689 if mset.clseq == 0 || mset.clseq < lseq { 7690 // Re-capture 7691 lseq, clfs = mset.lseq, mset.clfs 7692 mset.clseq = lseq + clfs 7693 } 7694 7695 // Check if we have an interest policy and discard new with max msgs or bytes. 7696 // We need to deny here otherwise it could succeed on some peers and not others 7697 // depending on consumer ack state. So we deny here, if we allow that means we know 7698 // it would succeed on every peer. 7699 if interestPolicy && discard == DiscardNew && (maxMsgs > 0 || maxBytes > 0) { 7700 // Track inflight. 7701 if mset.inflight == nil { 7702 mset.inflight = make(map[uint64]uint64) 7703 } 7704 if mset.cfg.Storage == FileStorage { 7705 mset.inflight[mset.clseq] = fileStoreMsgSize(subject, hdr, msg) 7706 } else { 7707 mset.inflight[mset.clseq] = memStoreMsgSize(subject, hdr, msg) 7708 } 7709 7710 var err error 7711 if maxMsgs > 0 && state.Msgs+uint64(len(mset.inflight)) > uint64(maxMsgs) { 7712 err = ErrMaxMsgs 7713 } else if maxBytes > 0 { 7714 // TODO(dlc) - Could track this rollup independently. 7715 var bytesPending uint64 7716 for _, nb := range mset.inflight { 7717 bytesPending += nb 7718 } 7719 if state.Bytes+bytesPending > uint64(maxBytes) { 7720 err = ErrMaxBytes 7721 } 7722 } 7723 if err != nil { 7724 delete(mset.inflight, mset.clseq) 7725 mset.clMu.Unlock() 7726 if canRespond { 7727 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7728 resp.Error = NewJSStreamStoreFailedError(err, Unless(err)) 7729 response, _ = json.Marshal(resp) 7730 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) 7731 } 7732 return err 7733 } 7734 } 7735 7736 esm := encodeStreamMsgAllowCompress(subject, reply, hdr, msg, mset.clseq, time.Now().UnixNano(), mset.compressOK) 7737 var mtKey uint64 7738 if mt != nil { 7739 mtKey = mset.clseq 7740 if mset.mt == nil { 7741 mset.mt = make(map[uint64]*msgTrace) 7742 } 7743 mset.mt[mtKey] = mt 7744 } 7745 mset.clseq++ 7746 7747 // Do proposal. 7748 err := node.Propose(esm) 7749 if err != nil && mset.clseq > 0 { 7750 mset.clseq-- 7751 } 7752 7753 // Check to see if we are being overrun. 7754 // TODO(dlc) - Make this a limit where we drop messages to protect ourselves, but allow to be configured. 7755 if mset.clseq-(lseq+clfs) > streamLagWarnThreshold { 7756 lerr := fmt.Errorf("JetStream stream '%s > %s' has high message lag", jsa.acc().Name, name) 7757 s.RateLimitWarnf(lerr.Error()) 7758 } 7759 mset.clMu.Unlock() 7760 7761 if err != nil { 7762 if mt != nil { 7763 mset.getAndDeleteMsgTrace(mtKey) 7764 } 7765 if canRespond { 7766 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: mset.cfg.Name}} 7767 resp.Error = &ApiError{Code: 503, Description: err.Error()} 7768 response, _ = json.Marshal(resp) 7769 // If we errored out respond here. 
7770 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) 7771 } 7772 } 7773 7774 if err != nil && isOutOfSpaceErr(err) { 7775 s.handleOutOfSpace(mset) 7776 } 7777 7778 return err 7779 } 7780 7781 func (mset *stream) getAndDeleteMsgTrace(lseq uint64) *msgTrace { 7782 if mset == nil { 7783 return nil 7784 } 7785 mset.clMu.Lock() 7786 mt, ok := mset.mt[lseq] 7787 if ok { 7788 delete(mset.mt, lseq) 7789 } 7790 mset.clMu.Unlock() 7791 return mt 7792 } 7793 7794 // For requesting messages post raft snapshot to catch up streams post server restart. 7795 // Any deleted msgs etc will be handled inline on catchup. 7796 type streamSyncRequest struct { 7797 Peer string `json:"peer,omitempty"` 7798 FirstSeq uint64 `json:"first_seq"` 7799 LastSeq uint64 `json:"last_seq"` 7800 DeleteRangesOk bool `json:"delete_ranges"` 7801 } 7802 7803 // Given a stream state that represents a snapshot, calculate the sync request based on our current state. 7804 func (mset *stream) calculateSyncRequest(state *StreamState, snap *StreamReplicatedState) *streamSyncRequest { 7805 // Quick check if we are already caught up. 7806 if state.LastSeq >= snap.LastSeq { 7807 return nil 7808 } 7809 return &streamSyncRequest{FirstSeq: state.LastSeq + 1, LastSeq: snap.LastSeq, Peer: mset.node.ID(), DeleteRangesOk: true} 7810 } 7811 7812 // processSnapshotDeletes will update our current store based on the snapshot 7813 // but only processing deletes and new FirstSeq / purges. 7814 func (mset *stream) processSnapshotDeletes(snap *StreamReplicatedState) { 7815 mset.mu.Lock() 7816 var state StreamState 7817 mset.store.FastState(&state) 7818 // Always adjust if FirstSeq has moved beyond our state. 7819 var didReset bool 7820 if snap.FirstSeq > state.FirstSeq { 7821 mset.store.Compact(snap.FirstSeq) 7822 mset.store.FastState(&state) 7823 mset.lseq = state.LastSeq 7824 mset.clearAllPreAcksBelowFloor(state.FirstSeq) 7825 didReset = true 7826 } 7827 s := mset.srv 7828 mset.mu.Unlock() 7829 7830 if didReset { 7831 s.Warnf("Catchup for stream '%s > %s' resetting first sequence: %d on catchup request", 7832 mset.account(), mset.name(), snap.FirstSeq) 7833 } 7834 7835 if len(snap.Deleted) > 0 { 7836 mset.store.SyncDeleted(snap.Deleted) 7837 } 7838 } 7839 7840 func (mset *stream) setCatchupPeer(peer string, lag uint64) { 7841 if peer == _EMPTY_ { 7842 return 7843 } 7844 mset.mu.Lock() 7845 if mset.catchups == nil { 7846 mset.catchups = make(map[string]uint64) 7847 } 7848 mset.catchups[peer] = lag 7849 mset.mu.Unlock() 7850 } 7851 7852 // Will decrement by one. 7853 func (mset *stream) updateCatchupPeer(peer string) { 7854 if peer == _EMPTY_ { 7855 return 7856 } 7857 mset.mu.Lock() 7858 if lag := mset.catchups[peer]; lag > 0 { 7859 mset.catchups[peer] = lag - 1 7860 } 7861 mset.mu.Unlock() 7862 } 7863 7864 func (mset *stream) decrementCatchupPeer(peer string, num uint64) { 7865 if peer == _EMPTY_ { 7866 return 7867 } 7868 mset.mu.Lock() 7869 if lag := mset.catchups[peer]; lag > 0 { 7870 if lag >= num { 7871 lag -= num 7872 } else { 7873 lag = 0 7874 } 7875 mset.catchups[peer] = lag 7876 } 7877 mset.mu.Unlock() 7878 } 7879 7880 func (mset *stream) clearCatchupPeer(peer string) { 7881 mset.mu.Lock() 7882 if mset.catchups != nil { 7883 delete(mset.catchups, peer) 7884 } 7885 mset.mu.Unlock() 7886 } 7887 7888 // Lock should be held. 
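// clearAllCatchupPeers drops all per-peer catchup lag tracking at once; the helpers above adjust
// individual entries as catchup batches are acknowledged, and checkClusterInfo below uses
// lagForCatchupPeer to mark replicas that are still catching up as not current.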
7889 func (mset *stream) clearAllCatchupPeers() { 7890 if mset.catchups != nil { 7891 mset.catchups = nil 7892 } 7893 } 7894 7895 func (mset *stream) lagForCatchupPeer(peer string) uint64 { 7896 mset.mu.RLock() 7897 defer mset.mu.RUnlock() 7898 if mset.catchups == nil { 7899 return 0 7900 } 7901 return mset.catchups[peer] 7902 } 7903 7904 func (mset *stream) hasCatchupPeers() bool { 7905 mset.mu.RLock() 7906 defer mset.mu.RUnlock() 7907 return len(mset.catchups) > 0 7908 } 7909 7910 func (mset *stream) setCatchingUp() { 7911 mset.catchup.Store(true) 7912 } 7913 7914 func (mset *stream) clearCatchingUp() { 7915 mset.catchup.Store(false) 7916 } 7917 7918 func (mset *stream) isCatchingUp() bool { 7919 return mset.catchup.Load() 7920 } 7921 7922 // Determine if a non-leader is current. 7923 // Lock should be held. 7924 func (mset *stream) isCurrent() bool { 7925 if mset.node == nil { 7926 return true 7927 } 7928 return mset.node.Current() && !mset.catchup.Load() 7929 } 7930 7931 // Maximum requests for the whole server that can be in flight at the same time. 7932 const maxConcurrentSyncRequests = 16 7933 7934 var ( 7935 errCatchupCorruptSnapshot = errors.New("corrupt stream snapshot detected") 7936 errCatchupStalled = errors.New("catchup stalled") 7937 errCatchupStreamStopped = errors.New("stream has been stopped") // when a catchup is terminated due to the stream going away. 7938 errCatchupBadMsg = errors.New("bad catchup msg") 7939 errCatchupWrongSeqForSkip = errors.New("wrong sequence for skipped msg") 7940 ) 7941 7942 // Process a stream snapshot. 7943 func (mset *stream) processSnapshot(snap *StreamReplicatedState) (e error) { 7944 // Update any deletes, etc. 7945 mset.processSnapshotDeletes(snap) 7946 7947 mset.mu.Lock() 7948 var state StreamState 7949 mset.store.FastState(&state) 7950 mset.setCLFS(snap.Failed) 7951 sreq := mset.calculateSyncRequest(&state, snap) 7952 7953 s, js, subject, n, st := mset.srv, mset.js, mset.sa.Sync, mset.node, mset.cfg.Storage 7954 qname := fmt.Sprintf("[ACC:%s] stream '%s' snapshot", mset.acc.Name, mset.cfg.Name) 7955 mset.mu.Unlock() 7956 7957 // Bug that would cause this to be empty on stream update. 7958 if subject == _EMPTY_ { 7959 return errCatchupCorruptSnapshot 7960 } 7961 7962 // Just return if up to date or already exceeded limits. 7963 if sreq == nil || js.limitsExceeded(st) { 7964 return nil 7965 } 7966 7967 // Pause the apply channel for our raft group while we catch up. 7968 if err := n.PauseApply(); err != nil { 7969 return err 7970 } 7971 7972 defer func() { 7973 // Don't bother resuming if server or stream is gone. 7974 if e != errCatchupStreamStopped && e != ErrServerNotRunning { 7975 n.ResumeApply() 7976 } 7977 }() 7978 7979 // Set our catchup state. 7980 mset.setCatchingUp() 7981 defer mset.clearCatchingUp() 7982 7983 var sub *subscription 7984 var err error 7985 7986 const activityInterval = 30 * time.Second 7987 notActive := time.NewTimer(activityInterval) 7988 defer notActive.Stop() 7989 7990 defer func() { 7991 if sub != nil { 7992 s.sysUnsubscribe(sub) 7993 } 7994 // Make sure any consumers are updated for the pending amounts. 7995 mset.mu.Lock() 7996 for _, o := range mset.consumers { 7997 o.mu.Lock() 7998 if o.isLeader() { 7999 o.streamNumPending() 8000 } 8001 o.mu.Unlock() 8002 } 8003 mset.mu.Unlock() 8004 }() 8005 8006 var releaseSem bool 8007 releaseSyncOutSem := func() { 8008 if !releaseSem { 8009 return 8010 } 8011 // Need to use select for the server shutdown case. 
8012 select { 8013 case s.syncOutSem <- struct{}{}: 8014 default: 8015 } 8016 releaseSem = false 8017 } 8018 // On exit, we will release our semaphore if we acquired it. 8019 defer releaseSyncOutSem() 8020 8021 // Check our final state when we exit cleanly. 8022 // This will make sure we have interest consumers updated. 8023 checkFinalState := func() { 8024 // Bail if no stream. 8025 if mset == nil { 8026 return 8027 } 8028 mset.mu.RLock() 8029 consumers := make([]*consumer, 0, len(mset.consumers)) 8030 for _, o := range mset.consumers { 8031 consumers = append(consumers, o) 8032 } 8033 mset.mu.RUnlock() 8034 for _, o := range consumers { 8035 o.checkStateForInterestStream() 8036 } 8037 } 8038 8039 // Do not let this go on forever. 8040 const maxRetries = 3 8041 var numRetries int 8042 8043 RETRY: 8044 // On retry, we need to release the semaphore we got. Call will be no-op 8045 // if releaseSem boolean has not been set to true on successfully getting 8046 // the semaphore. 8047 releaseSyncOutSem() 8048 8049 if n.GroupLeader() == _EMPTY_ { 8050 return fmt.Errorf("catchup for stream '%s > %s' aborted, no leader", mset.account(), mset.name()) 8051 } 8052 8053 // If we have a sub clear that here. 8054 if sub != nil { 8055 s.sysUnsubscribe(sub) 8056 sub = nil 8057 } 8058 8059 if !s.isRunning() { 8060 return ErrServerNotRunning 8061 } 8062 8063 numRetries++ 8064 if numRetries >= maxRetries { 8065 // Force a hard reset here. 8066 return errFirstSequenceMismatch 8067 } 8068 8069 // Block here if we have too many requests in flight. 8070 <-s.syncOutSem 8071 releaseSem = true 8072 8073 // We may have been blocked for a bit, so the reset needs to ensure that we 8074 // consume the already fired timer. 8075 if !notActive.Stop() { 8076 select { 8077 case <-notActive.C: 8078 default: 8079 } 8080 } 8081 notActive.Reset(activityInterval) 8082 8083 // Grab sync request again on failures. 8084 if sreq == nil { 8085 mset.mu.RLock() 8086 var state StreamState 8087 mset.store.FastState(&state) 8088 sreq = mset.calculateSyncRequest(&state, snap) 8089 mset.mu.RUnlock() 8090 if sreq == nil { 8091 return nil 8092 } 8093 } 8094 8095 // Used to transfer message from the wire to another Go routine internally. 8096 type im struct { 8097 msg []byte 8098 reply string 8099 } 8100 // This is used to notify the leader that it should stop the runCatchup 8101 // because we are either bailing out or going to retry due to an error. 8102 notifyLeaderStopCatchup := func(mrec *im, err error) { 8103 if mrec.reply == _EMPTY_ { 8104 return 8105 } 8106 s.sendInternalMsgLocked(mrec.reply, _EMPTY_, nil, err.Error()) 8107 } 8108 8109 msgsQ := newIPQueue[*im](s, qname) 8110 defer msgsQ.unregister() 8111 8112 // Send our catchup request here. 8113 reply := syncReplySubject() 8114 sub, err = s.sysSubscribe(reply, func(_ *subscription, _ *client, _ *Account, _, reply string, msg []byte) { 8115 // Make copy since we are using a buffer from the inbound client/route. 8116 msgsQ.push(&im{copyBytes(msg), reply}) 8117 }) 8118 if err != nil { 8119 s.Errorf("Could not subscribe to stream catchup: %v", err) 8120 goto RETRY 8121 } 8122 8123 // Send our sync request. 8124 b, _ := json.Marshal(sreq) 8125 s.sendInternalMsgLocked(subject, reply, nil, b) 8126 // Remember when we sent this out to avoid loop spins on errors below. 8127 reqSendTime := time.Now() 8128 // Clear our sync request. 8129 sreq = nil 8130 8131 // Run our own select loop here. 
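// The loop exits cleanly on the zero-length EOF message from the leader, returns on unrecoverable
// errors (out of space, resources exceeded), jumps to RETRY on other errors or on a stall, and
// steps down and retries if we are elected leader mid-catchup.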
8132 for qch, lch := n.QuitC(), n.LeadChangeC(); ; { 8133 select { 8134 case <-msgsQ.ch: 8135 notActive.Reset(activityInterval) 8136 8137 mrecs := msgsQ.pop() 8138 for _, mrec := range mrecs { 8139 msg := mrec.msg 8140 // Check for eof signaling. 8141 if len(msg) == 0 { 8142 msgsQ.recycle(&mrecs) 8143 checkFinalState() 8144 return nil 8145 } 8146 if _, err := mset.processCatchupMsg(msg); err == nil { 8147 if mrec.reply != _EMPTY_ { 8148 s.sendInternalMsgLocked(mrec.reply, _EMPTY_, nil, nil) 8149 } 8150 } else if isOutOfSpaceErr(err) { 8151 notifyLeaderStopCatchup(mrec, err) 8152 return err 8153 } else if err == NewJSInsufficientResourcesError() { 8154 notifyLeaderStopCatchup(mrec, err) 8155 if mset.js.limitsExceeded(mset.cfg.Storage) { 8156 s.resourcesExceededError() 8157 } else { 8158 s.Warnf("Catchup for stream '%s > %s' errored, account resources exceeded: %v", mset.account(), mset.name(), err) 8159 } 8160 msgsQ.recycle(&mrecs) 8161 return err 8162 } else { 8163 notifyLeaderStopCatchup(mrec, err) 8164 s.Warnf("Catchup for stream '%s > %s' errored, will retry: %v", mset.account(), mset.name(), err) 8165 msgsQ.recycle(&mrecs) 8166 8167 // Make sure we do not spin and make things worse. 8168 const minRetryWait = 2 * time.Second 8169 elapsed := time.Since(reqSendTime) 8170 if elapsed < minRetryWait { 8171 select { 8172 case <-s.quitCh: 8173 return ErrServerNotRunning 8174 case <-qch: 8175 return errCatchupStreamStopped 8176 case <-time.After(minRetryWait - elapsed): 8177 } 8178 } 8179 goto RETRY 8180 } 8181 } 8182 notActive.Reset(activityInterval) 8183 msgsQ.recycle(&mrecs) 8184 case <-notActive.C: 8185 if mrecs := msgsQ.pop(); len(mrecs) > 0 { 8186 mrec := mrecs[0] 8187 notifyLeaderStopCatchup(mrec, errCatchupStalled) 8188 msgsQ.recycle(&mrecs) 8189 } 8190 s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name()) 8191 goto RETRY 8192 case <-s.quitCh: 8193 return ErrServerNotRunning 8194 case <-qch: 8195 return errCatchupStreamStopped 8196 case isLeader := <-lch: 8197 if isLeader { 8198 n.StepDown() 8199 goto RETRY 8200 } 8201 } 8202 } 8203 } 8204 8205 // processCatchupMsg will be called to process out of band catchup msgs from a sync request. 8206 func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) { 8207 if len(msg) == 0 { 8208 return 0, errCatchupBadMsg 8209 } 8210 op := entryOp(msg[0]) 8211 if op != streamMsgOp && op != compressedStreamMsgOp && op != deleteRangeOp { 8212 return 0, errCatchupBadMsg 8213 } 8214 8215 mbuf := msg[1:] 8216 if op == deleteRangeOp { 8217 dr, err := decodeDeleteRange(mbuf) 8218 if err != nil { 8219 return 0, errCatchupBadMsg 8220 } 8221 // Handle the delete range. 8222 // Make sure the sequences match up properly. 
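// A delete range covers sequences [dr.First, dr.First+dr.Num); SkipMsgs records them as deleted
// placeholders so lseq advances without storing any message data.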
8223 mset.mu.Lock() 8224 if len(mset.preAcks) > 0 { 8225 for seq := dr.First; seq < dr.First+dr.Num; seq++ { 8226 mset.clearAllPreAcks(seq) 8227 } 8228 } 8229 if err = mset.store.SkipMsgs(dr.First, dr.Num); err != nil { 8230 mset.mu.Unlock() 8231 return 0, errCatchupWrongSeqForSkip 8232 } 8233 mset.lseq = dr.First + dr.Num - 1 8234 lseq := mset.lseq 8235 mset.mu.Unlock() 8236 return lseq, nil 8237 } 8238 8239 if op == compressedStreamMsgOp { 8240 var err error 8241 mbuf, err = s2.Decode(nil, mbuf) 8242 if err != nil { 8243 panic(err.Error()) 8244 } 8245 } 8246 8247 subj, _, hdr, msg, seq, ts, err := decodeStreamMsg(mbuf) 8248 if err != nil { 8249 return 0, errCatchupBadMsg 8250 } 8251 8252 mset.mu.Lock() 8253 st := mset.cfg.Storage 8254 ddloaded := mset.ddloaded 8255 tierName := mset.tier 8256 replicas := mset.cfg.Replicas 8257 8258 if mset.hasAllPreAcks(seq, subj) { 8259 mset.clearAllPreAcks(seq) 8260 // Mark this to be skipped 8261 subj, ts = _EMPTY_, 0 8262 } 8263 mset.mu.Unlock() 8264 8265 if mset.js.limitsExceeded(st) { 8266 return 0, NewJSInsufficientResourcesError() 8267 } else if exceeded, apiErr := mset.jsa.limitsExceeded(st, tierName, replicas); apiErr != nil { 8268 return 0, apiErr 8269 } else if exceeded { 8270 return 0, NewJSInsufficientResourcesError() 8271 } 8272 8273 // Put into our store 8274 // Messages to be skipped have no subject or timestamp. 8275 // TODO(dlc) - formalize with skipMsgOp 8276 if subj == _EMPTY_ && ts == 0 { 8277 if lseq := mset.store.SkipMsg(); lseq != seq { 8278 return 0, errCatchupWrongSeqForSkip 8279 } 8280 } else if err := mset.store.StoreRawMsg(subj, hdr, msg, seq, ts); err != nil { 8281 return 0, err 8282 } 8283 8284 // Update our lseq. 8285 mset.setLastSeq(seq) 8286 8287 // Check for MsgId and if we have one here make sure to update our internal map. 8288 if len(hdr) > 0 { 8289 if msgId := getMsgId(hdr); msgId != _EMPTY_ { 8290 if !ddloaded { 8291 mset.mu.Lock() 8292 mset.rebuildDedupe() 8293 mset.mu.Unlock() 8294 } 8295 mset.storeMsgId(&ddentry{msgId, seq, ts}) 8296 } 8297 } 8298 8299 return seq, nil 8300 } 8301 8302 func (mset *stream) handleClusterSyncRequest(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 8303 var sreq streamSyncRequest 8304 if err := json.Unmarshal(msg, &sreq); err != nil { 8305 // Log error. 8306 return 8307 } 8308 mset.srv.startGoRoutine(func() { mset.runCatchup(reply, &sreq) }) 8309 } 8310 8311 // Lock should be held. 8312 func (js *jetStream) offlineClusterInfo(rg *raftGroup) *ClusterInfo { 8313 s := js.srv 8314 8315 ci := &ClusterInfo{Name: s.ClusterName(), RaftGroup: rg.Name} 8316 for _, peer := range rg.Peers { 8317 if sir, ok := s.nodeToInfo.Load(peer); ok && sir != nil { 8318 si := sir.(nodeInfo) 8319 pi := &PeerInfo{Peer: peer, Name: si.name, Current: false, Offline: true} 8320 ci.Replicas = append(ci.Replicas, pi) 8321 } 8322 } 8323 return ci 8324 } 8325 8326 // clusterInfo will report on the status of the raft group. 
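// Peers not seen within lostQuorumInterval are reported as not current, and peers missing from
// nodeToInfo keep a placeholder name with Offline left true and Lag cleared.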
8327 func (js *jetStream) clusterInfo(rg *raftGroup) *ClusterInfo { 8328 if js == nil { 8329 return nil 8330 } 8331 js.mu.RLock() 8332 defer js.mu.RUnlock() 8333 8334 s := js.srv 8335 if rg == nil || rg.node == nil { 8336 return &ClusterInfo{ 8337 Name: s.cachedClusterName(), 8338 Leader: s.Name(), 8339 } 8340 } 8341 8342 n := rg.node 8343 ci := &ClusterInfo{ 8344 Name: s.cachedClusterName(), 8345 Leader: s.serverNameForNode(n.GroupLeader()), 8346 RaftGroup: rg.Name, 8347 } 8348 8349 now := time.Now() 8350 id, peers := n.ID(), n.Peers() 8351 8352 // If we are leaderless, do not suppress putting us in the peer list. 8353 if ci.Leader == _EMPTY_ { 8354 id = _EMPTY_ 8355 } 8356 8357 for _, rp := range peers { 8358 if rp.ID != id && rg.isMember(rp.ID) { 8359 var lastSeen time.Duration 8360 if now.After(rp.Last) && rp.Last.Unix() != 0 { 8361 lastSeen = now.Sub(rp.Last) 8362 } 8363 current := rp.Current 8364 if current && lastSeen > lostQuorumInterval { 8365 current = false 8366 } 8367 // Create a peer info with common settings if the peer has not been seen 8368 // yet (which can happen after the whole cluster is stopped and only some 8369 // of the nodes are restarted). 8370 pi := &PeerInfo{ 8371 Current: current, 8372 Offline: true, 8373 Active: lastSeen, 8374 Lag: rp.Lag, 8375 Peer: rp.ID, 8376 } 8377 // If node is found, complete/update the settings. 8378 if sir, ok := s.nodeToInfo.Load(rp.ID); ok && sir != nil { 8379 si := sir.(nodeInfo) 8380 pi.Name, pi.Offline, pi.cluster = si.name, si.offline, si.cluster 8381 } else { 8382 // If not, then add a name that indicates that the server name 8383 // is unknown at this time, and clear the lag since it is misleading 8384 // (the node may not have that much lag). 8385 // Note: We return now the Peer ID in PeerInfo, so the "(peerID: %s)" 8386 // would technically not be required, but keeping it for now. 8387 pi.Name, pi.Lag = fmt.Sprintf("Server name unknown at this time (peerID: %s)", rp.ID), 0 8388 } 8389 ci.Replicas = append(ci.Replicas, pi) 8390 } 8391 } 8392 // Order the result based on the name so that we get something consistent 8393 // when doing repeated stream info in the CLI, etc... 8394 sort.Slice(ci.Replicas, func(i, j int) bool { 8395 return ci.Replicas[i].Name < ci.Replicas[j].Name 8396 }) 8397 return ci 8398 } 8399 8400 func (mset *stream) checkClusterInfo(ci *ClusterInfo) { 8401 for _, r := range ci.Replicas { 8402 peer := getHash(r.Name) 8403 if lag := mset.lagForCatchupPeer(peer); lag > 0 { 8404 r.Current = false 8405 r.Lag = lag 8406 } 8407 } 8408 } 8409 8410 // Return a list of alternates, ranked by preference order to the request, of stream mirrors. 8411 // This allows clients to select or get more information about read replicas that could be a 8412 // better option to connect to versus the original source. 8413 func (js *jetStream) streamAlternates(ci *ClientInfo, stream string) []StreamAlternate { 8414 if js == nil { 8415 return nil 8416 } 8417 8418 js.mu.RLock() 8419 defer js.mu.RUnlock() 8420 8421 s, cc := js.srv, js.cluster 8422 // Track our domain. 8423 domain := s.getOpts().JetStreamDomain 8424 8425 // No clustering just return nil. 8426 if cc == nil { 8427 return nil 8428 } 8429 acc, _ := s.LookupAccount(ci.serviceAccount()) 8430 if acc == nil { 8431 return nil 8432 } 8433 8434 // Collect our ordering first for clusters. 8435 weights := make(map[string]int) 8436 all := []string{ci.Cluster} 8437 all = append(all, ci.Alternates...) 
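// Weight clusters by position: the client's own cluster gets the highest weight, followed by each
// listed alternate in request order.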
8438 8439 for i := 0; i < len(all); i++ { 8440 weights[all[i]] = len(all) - i 8441 } 8442 8443 var alts []StreamAlternate 8444 for _, sa := range cc.streams[acc.Name] { 8445 // Add in ourselves and any mirrors. 8446 if sa.Config.Name == stream || (sa.Config.Mirror != nil && sa.Config.Mirror.Name == stream) { 8447 alts = append(alts, StreamAlternate{Name: sa.Config.Name, Domain: domain, Cluster: sa.Group.Cluster}) 8448 } 8449 } 8450 // If it's just us, don't fill in. 8451 if len(alts) == 1 { 8452 return nil 8453 } 8454 8455 // Sort based on our weights that originate from the request itself. 8456 sort.Slice(alts, func(i, j int) bool { 8457 return weights[alts[i].Cluster] > weights[alts[j].Cluster] 8458 }) 8459 8460 return alts 8461 } 8462 8463 // Internal request for stream info. This is coming in off the wire, so do not block here. 8464 func (mset *stream) handleClusterStreamInfoRequest(_ *subscription, c *client, _ *Account, subject, reply string, _ []byte) { 8465 go mset.processClusterStreamInfoRequest(reply) 8466 } 8467 8468 func (mset *stream) processClusterStreamInfoRequest(reply string) { 8469 mset.mu.RLock() 8470 sysc, js, sa, config := mset.sysc, mset.srv.js.Load(), mset.sa, mset.cfg 8471 isLeader := mset.isLeader() 8472 mset.mu.RUnlock() 8473 8474 // By design all members will receive this. Normally we only want the leader answering. 8475 // But if we have stalled and lost quorum, all can respond. 8476 if sa != nil && !js.isGroupLeaderless(sa.Group) && !isLeader { 8477 return 8478 } 8479 8480 // If we are not the leader, let someone else possibly respond first. 8481 if !isLeader { 8482 time.Sleep(500 * time.Millisecond) 8483 } 8484 8485 si := &StreamInfo{ 8486 Created: mset.createdTime(), 8487 State: mset.state(), 8488 Config: config, 8489 Cluster: js.clusterInfo(mset.raftGroup()), 8490 Sources: mset.sourcesInfo(), 8491 Mirror: mset.mirrorInfo(), 8492 TimeStamp: time.Now().UTC(), 8493 } 8494 8495 // Check for out of band catchups. 8496 if mset.hasCatchupPeers() { 8497 mset.checkClusterInfo(si.Cluster) 8498 } 8499 8500 sysc.sendInternalMsg(reply, _EMPTY_, nil, si) 8501 } 8502 8503 // 64MB for now, for the total server. This is the max we will blast out if asked to 8504 // do so to another server for purposes of catchups. 8505 // This number should be ok on a 1Gbit interface. 8506 const defaultMaxTotalCatchupOutBytes = int64(64 * 1024 * 1024) 8507 8508 // Current total outstanding catchup bytes. 8509 func (s *Server) gcbTotal() int64 { 8510 s.gcbMu.RLock() 8511 defer s.gcbMu.RUnlock() 8512 return s.gcbOut 8513 } 8514 8515 // Returns true if the current total outstanding catchup bytes is below 8516 // the maximum configured. 8517 func (s *Server) gcbBelowMax() bool { 8518 s.gcbMu.RLock() 8519 defer s.gcbMu.RUnlock() 8520 return s.gcbOut <= s.gcbOutMax 8521 } 8522 8523 // Adds `sz` to the server's total outstanding catchup bytes and to `localsz` 8524 // under the gcbMu lock. The `localsz` points to the local outstanding catchup 8525 // bytes of the runCatchup go routine of a given stream. 8526 func (s *Server) gcbAdd(localsz *int64, sz int64) { 8527 s.gcbMu.Lock() 8528 atomic.AddInt64(localsz, sz) 8529 s.gcbOut += sz 8530 if s.gcbOut >= s.gcbOutMax && s.gcbKick == nil { 8531 s.gcbKick = make(chan struct{}) 8532 } 8533 s.gcbMu.Unlock() 8534 } 8535 8536 // Removes `sz` from the server's total outstanding catchup bytes and from 8537 // `localsz`, but only if `localsz` is non-zero; a value of 0 signals that gcbSubLast 8538 // has already been invoked. See that function for details.
8539 // Must be invoked under the gcbMu lock. 8540 func (s *Server) gcbSubLocked(localsz *int64, sz int64) { 8541 if atomic.LoadInt64(localsz) == 0 { 8542 return 8543 } 8544 atomic.AddInt64(localsz, -sz) 8545 s.gcbOut -= sz 8546 if s.gcbKick != nil && s.gcbOut < s.gcbOutMax { 8547 close(s.gcbKick) 8548 s.gcbKick = nil 8549 } 8550 } 8551 8552 // Locked version of gcbSubLocked() 8553 func (s *Server) gcbSub(localsz *int64, sz int64) { 8554 s.gcbMu.Lock() 8555 s.gcbSubLocked(localsz, sz) 8556 s.gcbMu.Unlock() 8557 } 8558 8559 // Similar to gcbSub() but reset `localsz` to 0 at the end under the gcbMu lock. 8560 // This will signal further calls to gcbSub() for this `localsz` pointer that 8561 // nothing should be done because runCatchup() has exited and any remaining 8562 // outstanding bytes value has already been decremented. 8563 func (s *Server) gcbSubLast(localsz *int64) { 8564 s.gcbMu.Lock() 8565 s.gcbSubLocked(localsz, *localsz) 8566 *localsz = 0 8567 s.gcbMu.Unlock() 8568 } 8569 8570 // Returns our kick chan, or nil if it does not exist. 8571 func (s *Server) cbKickChan() <-chan struct{} { 8572 s.gcbMu.RLock() 8573 defer s.gcbMu.RUnlock() 8574 return s.gcbKick 8575 } 8576 8577 func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { 8578 s := mset.srv 8579 defer s.grWG.Done() 8580 8581 const maxOutBytes = int64(64 * 1024 * 1024) // 64MB for now, these are all internal, from server to server 8582 const maxOutMsgs = int32(256 * 1024) // 256k in case we have lots of small messages or skip msgs. 8583 outb := int64(0) 8584 outm := int32(0) 8585 8586 // On abnormal exit make sure to update global total. 8587 defer s.gcbSubLast(&outb) 8588 8589 // Flow control processing. 8590 ackReplySize := func(subj string) int64 { 8591 if li := strings.LastIndexByte(subj, btsep); li > 0 && li < len(subj) { 8592 return parseAckReplyNum(subj[li+1:]) 8593 } 8594 return 0 8595 } 8596 8597 nextBatchC := make(chan struct{}, 1) 8598 nextBatchC <- struct{}{} 8599 remoteQuitCh := make(chan struct{}) 8600 8601 const activityInterval = 30 * time.Second 8602 notActive := time.NewTimer(activityInterval) 8603 defer notActive.Stop() 8604 8605 // Setup ackReply for flow control. 8606 ackReply := syncAckSubject() 8607 ackSub, _ := s.sysSubscribe(ackReply, func(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 8608 if len(msg) > 0 { 8609 s.Warnf("Catchup for stream '%s > %s' was aborted on the remote due to: %q", 8610 mset.account(), mset.name(), msg) 8611 s.sysUnsubscribe(sub) 8612 close(remoteQuitCh) 8613 return 8614 } 8615 sz := ackReplySize(subject) 8616 s.gcbSub(&outb, sz) 8617 atomic.AddInt32(&outm, -1) 8618 mset.updateCatchupPeer(sreq.Peer) 8619 // Kick ourselves and anyone else who might have stalled on global state. 8620 select { 8621 case nextBatchC <- struct{}{}: 8622 // Reset our activity 8623 notActive.Reset(activityInterval) 8624 default: 8625 } 8626 }) 8627 defer s.sysUnsubscribe(ackSub) 8628 ackReplyT := strings.ReplaceAll(ackReply, ".*", ".%d") 8629 8630 // Grab our state. 8631 var state StreamState 8632 mset.mu.RLock() 8633 mset.store.FastState(&state) 8634 mset.mu.RUnlock() 8635 8636 // Reset notion of first if this request wants sequences before our starting sequence 8637 // and we would have nothing to send. If we have partial messages still need to send skips for those. 8638 // We will keep sreq's first sequence to not create sequence mismatches on the follower, but we extend the last to our current state. 
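// (The condition below means the requested range lies entirely below our current first sequence,
// so everything in it would have to be sent as skips.)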
8639 if sreq.FirstSeq < state.FirstSeq && state.FirstSeq > sreq.LastSeq { 8640 s.Debugf("Catchup for stream '%s > %s' resetting request first sequence from %d to %d", 8641 mset.account(), mset.name(), sreq.FirstSeq, state.FirstSeq) 8642 if state.LastSeq > sreq.LastSeq { 8643 sreq.LastSeq = state.LastSeq 8644 } 8645 } 8646 8647 // Setup sequences to walk through. 8648 seq, last := sreq.FirstSeq, sreq.LastSeq 8649 mset.setCatchupPeer(sreq.Peer, last-seq) 8650 8651 // Check if we can compress during this. 8652 compressOk := mset.compressAllowed() 8653 8654 var spb int 8655 const minWait = 5 * time.Second 8656 8657 sendNextBatchAndContinue := func(qch chan struct{}) bool { 8658 // Check if we know we will not enter the loop because we are done. 8659 if seq > last { 8660 s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name()) 8661 // EOF 8662 s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) 8663 return false 8664 } 8665 8666 // If we already sent a batch, we will try to make sure we can at least send a minimum 8667 // batch before sending the next batch. 8668 if spb > 0 { 8669 // Wait til we can send at least 4k 8670 const minBatchWait = int32(4 * 1024) 8671 mw := time.NewTimer(minWait) 8672 for done := false; !done; { 8673 select { 8674 case <-nextBatchC: 8675 done = maxOutMsgs-atomic.LoadInt32(&outm) > minBatchWait 8676 if !done { 8677 // Wait for a small bit. 8678 time.Sleep(50 * time.Millisecond) 8679 } else { 8680 // GC friendly. 8681 mw.Stop() 8682 } 8683 case <-mw.C: 8684 done = true 8685 case <-s.quitCh: 8686 return false 8687 case <-qch: 8688 return false 8689 case <-remoteQuitCh: 8690 return false 8691 } 8692 } 8693 spb = 0 8694 } 8695 8696 // Send an encoded msg. 8697 sendEM := func(em []byte) { 8698 // Place size in reply subject for flow control. 8699 l := int64(len(em)) 8700 reply := fmt.Sprintf(ackReplyT, l) 8701 s.gcbAdd(&outb, l) 8702 atomic.AddInt32(&outm, 1) 8703 s.sendInternalMsgLocked(sendSubject, reply, nil, em) 8704 spb++ 8705 } 8706 8707 // If we support gap markers. 8708 var dr DeleteRange 8709 drOk := sreq.DeleteRangesOk 8710 8711 // Will send our delete range. 8712 // Should already be checked for being valid. 8713 sendDR := func() { 8714 if dr.Num == 1 { 8715 // Send like a normal skip msg. 8716 sendEM(encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, dr.First, 0)) 8717 } else { 8718 // We have a run, send a gap record. We send these without reply or tracking. 8719 s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, encodeDeleteRange(&dr)) 8720 // Clear out the pending for catchup. 8721 mset.decrementCatchupPeer(sreq.Peer, dr.Num) 8722 } 8723 // Reset always. 8724 dr.First, dr.Num = 0, 0 8725 } 8726 8727 var smv StoreMsg 8728 for ; seq <= last && atomic.LoadInt64(&outb) <= maxOutBytes && atomic.LoadInt32(&outm) <= maxOutMsgs && s.gcbBelowMax(); seq++ { 8729 sm, err := mset.store.LoadMsg(seq, &smv) 8730 // if this is not a deleted msg, bail out. 8731 if err != nil && err != ErrStoreMsgNotFound && err != errDeletedMsg { 8732 if err == ErrStoreEOF { 8733 var state StreamState 8734 mset.store.FastState(&state) 8735 if seq > state.LastSeq { 8736 // The snapshot has a larger last sequence then we have. This could be due to a truncation 8737 // when trying to recover after corruption, still not 100% sure. Could be off by 1 too somehow, 8738 // but tested a ton of those with no success. 
8739 s.Warnf("Catchup for stream '%s > %s' completed, but requested sequence %d was larger than current state: %+v", 8740 mset.account(), mset.name(), seq, state) 8741 // Try our best to redo our invalidated snapshot as well. 8742 if n := mset.raftNode(); n != nil { 8743 n.InstallSnapshot(mset.stateSnapshot()) 8744 } 8745 // Signal EOF 8746 s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) 8747 return false 8748 } 8749 } 8750 s.Warnf("Error loading message for catchup '%s > %s': %v", mset.account(), mset.name(), err) 8751 return false 8752 } 8753 8754 if sm != nil { 8755 // If we allow gap markers, check if we have one pending. 8756 if drOk && dr.First > 0 { 8757 sendDR() 8758 } 8759 // Send the normal message now. 8760 sendEM(encodeStreamMsgAllowCompress(sm.subj, _EMPTY_, sm.hdr, sm.msg, sm.seq, sm.ts, compressOk)) 8761 } else { 8762 if drOk { 8763 if dr.First == 0 { 8764 dr.First, dr.Num = seq, 1 8765 } else { 8766 dr.Num++ 8767 } 8768 } else { 8769 // Skip record for deleted msg. 8770 sendEM(encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq, 0)) 8771 } 8772 } 8773 8774 // Check if we are done. 8775 if seq == last { 8776 // Need to see if we have a pending delete range. 8777 if drOk && dr.First > 0 { 8778 sendDR() 8779 } 8780 // Check for a condition where our state's first is now past the last that we could have sent. 8781 // If so, reset last and continue sending. 8782 var state StreamState 8783 mset.mu.RLock() 8784 mset.store.FastState(&state) 8785 mset.mu.RUnlock() 8786 if last < state.FirstSeq { 8787 last = state.LastSeq 8788 } 8789 // Recheck our exit condition. 8790 if seq == last { 8791 s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name()) 8792 // EOF 8793 s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) 8794 return false 8795 } 8796 } 8797 select { 8798 case <-remoteQuitCh: 8799 return false 8800 default: 8801 } 8802 } 8803 if drOk && dr.First > 0 { 8804 sendDR() 8805 } 8806 8807 return true 8808 } 8809 8810 // Check if this stream got closed. 8811 mset.mu.RLock() 8812 qch := mset.qch 8813 mset.mu.RUnlock() 8814 if qch == nil { 8815 return 8816 } 8817 8818 // Run as long as we are still active and need catchup. 8819 // FIXME(dlc) - Purge event? Stream delete? 8820 for { 8821 // Get this each time; it will be non-nil if globally blocked, and will be closed to wake everyone up.
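// (gcbAdd creates gcbKick once the server-wide catchup byte budget is exceeded and gcbSubLocked
// closes it when usage drops back below the max, unblocking this select.)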
8822 cbKick := s.cbKickChan() 8823 8824 select { 8825 case <-s.quitCh: 8826 return 8827 case <-qch: 8828 return 8829 case <-remoteQuitCh: 8830 mset.clearCatchupPeer(sreq.Peer) 8831 return 8832 case <-notActive.C: 8833 s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name()) 8834 mset.clearCatchupPeer(sreq.Peer) 8835 return 8836 case <-nextBatchC: 8837 if !sendNextBatchAndContinue(qch) { 8838 mset.clearCatchupPeer(sreq.Peer) 8839 return 8840 } 8841 case <-cbKick: 8842 if !sendNextBatchAndContinue(qch) { 8843 mset.clearCatchupPeer(sreq.Peer) 8844 return 8845 } 8846 } 8847 } 8848 } 8849 8850 const jscAllSubj = "$JSC.>" 8851 8852 func syncSubjForStream() string { 8853 return syncSubject("$JSC.SYNC") 8854 } 8855 8856 func syncReplySubject() string { 8857 return syncSubject("$JSC.R") 8858 } 8859 8860 func infoReplySubject() string { 8861 return syncSubject("$JSC.R") 8862 } 8863 8864 func syncAckSubject() string { 8865 return syncSubject("$JSC.ACK") + ".*" 8866 } 8867 8868 func syncSubject(pre string) string { 8869 var sb strings.Builder 8870 sb.WriteString(pre) 8871 sb.WriteByte(btsep) 8872 8873 var b [replySuffixLen]byte 8874 rn := rand.Int63() 8875 for i, l := 0, rn; i < len(b); i++ { 8876 b[i] = digits[l%base] 8877 l /= base 8878 } 8879 8880 sb.Write(b[:]) 8881 return sb.String() 8882 } 8883 8884 const ( 8885 clusterStreamInfoT = "$JSC.SI.%s.%s" 8886 clusterConsumerInfoT = "$JSC.CI.%s.%s.%s" 8887 jsaUpdatesSubT = "$JSC.ARU.%s.*" 8888 jsaUpdatesPubT = "$JSC.ARU.%s.%s" 8889 )
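// Illustrative sketch only, not part of the original file: exampleStreamMsgRoundTrip is a
// hypothetical helper showing a minimal round-trip through the replicated stream message
// encoder/decoder defined above. The leading byte carries the entryOp and is stripped before
// decoding; every other identifier used here exists in this package.
func exampleStreamMsgRoundTrip() error {
	// Encode a message as the stream leader would before proposing it to the raft group.
	buf := encodeStreamMsg("orders.new", "_INBOX.reply", nil, []byte("hello"), 22, time.Now().UnixNano())
	if entryOp(buf[0]) != streamMsgOp {
		return fmt.Errorf("unexpected op byte: %d", buf[0])
	}
	// Followers decode the payload after stripping the op byte.
	subj, reply, hdr, msg, lseq, ts, err := decodeStreamMsg(buf[1:])
	if err != nil {
		return err
	}
	fmt.Printf("subj=%q reply=%q hdr=%dB msg=%q lseq=%d ts=%d\n", subj, reply, len(hdr), msg, lseq, ts)
	return nil
}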