github.com/nats-io/nats-server/v2@v2.11.0-preview.2/server/jetstream_cluster.go (about) 1 // Copyright 2020-2024 The NATS Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package server 15 16 import ( 17 "bytes" 18 crand "crypto/rand" 19 "encoding/binary" 20 "encoding/json" 21 "errors" 22 "fmt" 23 "math" 24 "math/rand" 25 "os" 26 "path/filepath" 27 "reflect" 28 "sort" 29 "strconv" 30 "strings" 31 "sync/atomic" 32 "time" 33 34 "github.com/klauspost/compress/s2" 35 "github.com/minio/highwayhash" 36 "github.com/nats-io/nuid" 37 ) 38 39 // jetStreamCluster holds information about the meta group and stream assignments. 40 type jetStreamCluster struct { 41 // The metacontroller raftNode. 42 meta RaftNode 43 // For stream and consumer assignments. All servers will have this be the same. 44 // ACCOUNT -> STREAM -> Stream Assignment -> Consumers 45 streams map[string]map[string]*streamAssignment 46 // These are inflight proposals and used to apply limits when there are 47 // concurrent requests that would otherwise be accepted. 48 // We also record the group for the stream. This is needed since if we have 49 // concurrent requests for same account and stream we need to let it process to get 50 // a response but they need to be same group, peers etc. 51 inflight map[string]map[string]*raftGroup 52 // Signals meta-leader should check the stream assignments. 53 streamsCheck bool 54 // Server. 55 s *Server 56 // Internal client. 57 c *client 58 // Processing assignment results. 59 streamResults *subscription 60 consumerResults *subscription 61 // System level request to have the leader stepdown. 62 stepdown *subscription 63 // System level requests to remove a peer. 64 peerRemove *subscription 65 // System level request to move a stream 66 peerStreamMove *subscription 67 // System level request to cancel a stream move 68 peerStreamCancelMove *subscription 69 // To pop out the monitorCluster before the raft layer. 70 qch chan struct{} 71 } 72 73 // Used to guide placement of streams and meta controllers in clustered JetStream. 74 type Placement struct { 75 Cluster string `json:"cluster,omitempty"` 76 Tags []string `json:"tags,omitempty"` 77 } 78 79 // Define types of the entry. 80 type entryOp uint8 81 82 // ONLY ADD TO THE END, DO NOT INSERT IN BETWEEN WILL BREAK SERVER INTEROP. 83 const ( 84 // Meta ops. 85 assignStreamOp entryOp = iota 86 assignConsumerOp 87 removeStreamOp 88 removeConsumerOp 89 // Stream ops. 90 streamMsgOp 91 purgeStreamOp 92 deleteMsgOp 93 // Consumer ops. 94 updateDeliveredOp 95 updateAcksOp 96 // Compressed consumer assignments. 97 assignCompressedConsumerOp 98 // Filtered Consumer skip. 99 updateSkipOp 100 // Update Stream. 101 updateStreamOp 102 // For updating information on pending pull requests. 103 addPendingRequest 104 removePendingRequest 105 // For sending compressed streams, either through RAFT or catchup. 106 compressedStreamMsgOp 107 // For sending deleted gaps on catchups for replicas. 108 deleteRangeOp 109 ) 110 111 // raftGroups are controlled by the metagroup controller. 112 // The raftGroups will house streams and consumers. 113 type raftGroup struct { 114 Name string `json:"name"` 115 Peers []string `json:"peers"` 116 Storage StorageType `json:"store"` 117 Cluster string `json:"cluster,omitempty"` 118 Preferred string `json:"preferred,omitempty"` 119 // Internal 120 node RaftNode 121 } 122 123 // streamAssignment is what the meta controller uses to assign streams to peers. 124 type streamAssignment struct { 125 Client *ClientInfo `json:"client,omitempty"` 126 Created time.Time `json:"created"` 127 Config *StreamConfig `json:"stream"` 128 Group *raftGroup `json:"group"` 129 Sync string `json:"sync"` 130 Subject string `json:"subject"` 131 Reply string `json:"reply"` 132 Restore *StreamState `json:"restore_state,omitempty"` 133 // Internal 134 consumers map[string]*consumerAssignment 135 responded bool 136 recovering bool 137 err error 138 } 139 140 // consumerAssignment is what the meta controller uses to assign consumers to streams. 141 type consumerAssignment struct { 142 Client *ClientInfo `json:"client,omitempty"` 143 Created time.Time `json:"created"` 144 Name string `json:"name"` 145 Stream string `json:"stream"` 146 Config *ConsumerConfig `json:"consumer"` 147 Group *raftGroup `json:"group"` 148 Subject string `json:"subject"` 149 Reply string `json:"reply"` 150 State *ConsumerState `json:"state,omitempty"` 151 // Internal 152 responded bool 153 recovering bool 154 deleted bool 155 err error 156 } 157 158 // streamPurge is what the stream leader will replicate when purging a stream. 159 type streamPurge struct { 160 Client *ClientInfo `json:"client,omitempty"` 161 Stream string `json:"stream"` 162 LastSeq uint64 `json:"last_seq"` 163 Subject string `json:"subject"` 164 Reply string `json:"reply"` 165 Request *JSApiStreamPurgeRequest `json:"request,omitempty"` 166 } 167 168 // streamMsgDelete is what the stream leader will replicate when deleting a message. 169 type streamMsgDelete struct { 170 Client *ClientInfo `json:"client,omitempty"` 171 Stream string `json:"stream"` 172 Seq uint64 `json:"seq"` 173 NoErase bool `json:"no_erase,omitempty"` 174 Subject string `json:"subject"` 175 Reply string `json:"reply"` 176 } 177 178 const ( 179 defaultStoreDirName = "_js_" 180 defaultMetaGroupName = "_meta_" 181 defaultMetaFSBlkSize = 1024 * 1024 182 jsExcludePlacement = "!jetstream" 183 ) 184 185 // Returns information useful in mixed mode. 186 func (s *Server) trackedJetStreamServers() (js, total int) { 187 s.mu.RLock() 188 defer s.mu.RUnlock() 189 if !s.isRunning() || !s.eventsEnabled() { 190 return -1, -1 191 } 192 s.nodeToInfo.Range(func(k, v any) bool { 193 si := v.(nodeInfo) 194 if si.js { 195 js++ 196 } 197 total++ 198 return true 199 }) 200 return js, total 201 } 202 203 func (s *Server) getJetStreamCluster() (*jetStream, *jetStreamCluster) { 204 if s.isShuttingDown() { 205 return nil, nil 206 } 207 208 js := s.getJetStream() 209 if js == nil { 210 return nil, nil 211 } 212 213 // Only set once, do not need a lock. 214 return js, js.cluster 215 } 216 217 func (s *Server) JetStreamIsClustered() bool { 218 js := s.getJetStream() 219 if js == nil { 220 return false 221 } 222 return js.isClustered() 223 } 224 225 func (s *Server) JetStreamIsLeader() bool { 226 return s.isMetaLeader.Load() 227 } 228 229 func (s *Server) JetStreamIsCurrent() bool { 230 js := s.getJetStream() 231 if js == nil { 232 return false 233 } 234 // Grab what we need and release js lock. 235 js.mu.RLock() 236 var meta RaftNode 237 cc := js.cluster 238 if cc != nil { 239 meta = cc.meta 240 } 241 js.mu.RUnlock() 242 243 if cc == nil { 244 // Non-clustered mode 245 return true 246 } 247 return meta.Current() 248 } 249 250 func (s *Server) JetStreamSnapshotMeta() error { 251 js := s.getJetStream() 252 if js == nil { 253 return NewJSNotEnabledError() 254 } 255 js.mu.RLock() 256 cc := js.cluster 257 isLeader := cc.isLeader() 258 meta := cc.meta 259 js.mu.RUnlock() 260 261 if !isLeader { 262 return errNotLeader 263 } 264 265 return meta.InstallSnapshot(js.metaSnapshot()) 266 } 267 268 func (s *Server) JetStreamStepdownStream(account, stream string) error { 269 js, cc := s.getJetStreamCluster() 270 if js == nil { 271 return NewJSNotEnabledError() 272 } 273 if cc == nil { 274 return NewJSClusterNotActiveError() 275 } 276 // Grab account 277 acc, err := s.LookupAccount(account) 278 if err != nil { 279 return err 280 } 281 // Grab stream 282 mset, err := acc.lookupStream(stream) 283 if err != nil { 284 return err 285 } 286 287 if node := mset.raftNode(); node != nil && node.Leader() { 288 node.StepDown() 289 } 290 291 return nil 292 } 293 294 func (s *Server) JetStreamStepdownConsumer(account, stream, consumer string) error { 295 js, cc := s.getJetStreamCluster() 296 if js == nil { 297 return NewJSNotEnabledError() 298 } 299 if cc == nil { 300 return NewJSClusterNotActiveError() 301 } 302 // Grab account 303 acc, err := s.LookupAccount(account) 304 if err != nil { 305 return err 306 } 307 // Grab stream 308 mset, err := acc.lookupStream(stream) 309 if err != nil { 310 return err 311 } 312 313 o := mset.lookupConsumer(consumer) 314 if o == nil { 315 return NewJSConsumerNotFoundError() 316 } 317 318 if node := o.raftNode(); node != nil && node.Leader() { 319 node.StepDown() 320 } 321 322 return nil 323 } 324 325 func (s *Server) JetStreamSnapshotStream(account, stream string) error { 326 js, cc := s.getJetStreamCluster() 327 if js == nil { 328 return NewJSNotEnabledForAccountError() 329 } 330 if cc == nil { 331 return NewJSClusterNotActiveError() 332 } 333 // Grab account 334 acc, err := s.LookupAccount(account) 335 if err != nil { 336 return err 337 } 338 // Grab stream 339 mset, err := acc.lookupStream(stream) 340 if err != nil { 341 return err 342 } 343 344 // Hold lock when installing snapshot. 345 mset.mu.Lock() 346 if mset.node == nil { 347 mset.mu.Unlock() 348 return nil 349 } 350 err = mset.node.InstallSnapshot(mset.stateSnapshotLocked()) 351 mset.mu.Unlock() 352 353 return err 354 } 355 356 func (s *Server) JetStreamClusterPeers() []string { 357 js := s.getJetStream() 358 if js == nil { 359 return nil 360 } 361 js.mu.RLock() 362 defer js.mu.RUnlock() 363 364 cc := js.cluster 365 if !cc.isLeader() || cc.meta == nil { 366 return nil 367 } 368 peers := cc.meta.Peers() 369 var nodes []string 370 for _, p := range peers { 371 si, ok := s.nodeToInfo.Load(p.ID) 372 if !ok || si == nil { 373 continue 374 } 375 ni := si.(nodeInfo) 376 // Ignore if offline, no JS, or no current stats have been received. 377 if ni.offline || !ni.js || ni.stats == nil { 378 continue 379 } 380 nodes = append(nodes, si.(nodeInfo).name) 381 } 382 return nodes 383 } 384 385 // Read lock should be held. 386 func (cc *jetStreamCluster) isLeader() bool { 387 if cc == nil { 388 // Non-clustered mode 389 return true 390 } 391 return cc.meta != nil && cc.meta.Leader() 392 } 393 394 // isStreamCurrent will determine if the stream is up to date. 395 // For R1 it will make sure the stream is present on this server. 396 // Read lock should be held. 397 func (cc *jetStreamCluster) isStreamCurrent(account, stream string) bool { 398 if cc == nil { 399 // Non-clustered mode 400 return true 401 } 402 as := cc.streams[account] 403 if as == nil { 404 return false 405 } 406 sa := as[stream] 407 if sa == nil { 408 return false 409 } 410 rg := sa.Group 411 if rg == nil { 412 return false 413 } 414 415 if rg.node == nil || rg.node.Current() { 416 // Check if we are processing a snapshot and are catching up. 417 acc, err := cc.s.LookupAccount(account) 418 if err != nil { 419 return false 420 } 421 mset, err := acc.lookupStream(stream) 422 if err != nil { 423 return false 424 } 425 if mset.isCatchingUp() { 426 return false 427 } 428 // Success. 429 return true 430 } 431 432 return false 433 } 434 435 // Restart the stream in question. 436 // Should only be called when the stream is known to be in a bad state. 437 func (js *jetStream) restartStream(acc *Account, csa *streamAssignment) { 438 js.mu.Lock() 439 s, cc := js.srv, js.cluster 440 if cc == nil { 441 js.mu.Unlock() 442 return 443 } 444 // Need to lookup the one directly from the meta layer, what we get handed is a copy if coming from isStreamHealthy. 445 asa := cc.streams[acc.Name] 446 if asa == nil { 447 js.mu.Unlock() 448 return 449 } 450 sa := asa[csa.Config.Name] 451 if sa == nil { 452 js.mu.Unlock() 453 return 454 } 455 // Make sure to clear out the raft node if still present in the meta layer. 456 if rg := sa.Group; rg != nil && rg.node != nil { 457 if rg.node.State() != Closed { 458 rg.node.Stop() 459 } 460 rg.node = nil 461 } 462 sinceCreation := time.Since(sa.Created) 463 js.mu.Unlock() 464 465 // Process stream assignment to recreate. 466 // Check that we have given system enough time to start us up. 467 // This will be longer than obvious, and matches consumer logic in case system very busy. 468 if sinceCreation < 10*time.Second { 469 s.Debugf("Not restarting missing stream '%s > %s', too soon since creation %v", 470 acc, csa.Config.Name, sinceCreation) 471 return 472 } 473 474 js.processStreamAssignment(sa) 475 476 // If we had consumers assigned to this server they will be present in the copy, csa. 477 // They also need to be processed. The csa consumers is a copy of only our consumers, 478 // those assigned to us, but the consumer assignment's there are direct from the meta 479 // layer to make this part much easier and avoid excessive lookups. 480 for _, cca := range csa.consumers { 481 if cca.deleted { 482 continue 483 } 484 // Need to look up original as well here to make sure node is nil. 485 js.mu.Lock() 486 ca := sa.consumers[cca.Name] 487 if ca != nil && ca.Group != nil { 488 // Make sure the node is stopped if still running. 489 if node := ca.Group.node; node != nil && node.State() != Closed { 490 node.Stop() 491 } 492 // Make sure node is wiped. 493 ca.Group.node = nil 494 } 495 js.mu.Unlock() 496 if ca != nil { 497 js.processConsumerAssignment(ca) 498 } 499 } 500 } 501 502 // isStreamHealthy will determine if the stream is up to date or very close. 503 // For R1 it will make sure the stream is present on this server. 504 func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool { 505 js.mu.RLock() 506 s, cc := js.srv, js.cluster 507 if cc == nil { 508 // Non-clustered mode 509 js.mu.RUnlock() 510 return true 511 } 512 513 // Pull the group out. 514 rg := sa.Group 515 if rg == nil { 516 js.mu.RUnlock() 517 return false 518 } 519 520 streamName := sa.Config.Name 521 node := rg.node 522 js.mu.RUnlock() 523 524 // First lookup stream and make sure its there. 525 mset, err := acc.lookupStream(streamName) 526 if err != nil { 527 js.restartStream(acc, sa) 528 return false 529 } 530 531 // If we are catching up return false. 532 if mset.isCatchingUp() { 533 return false 534 } 535 536 if node == nil || node.Healthy() { 537 // Check if we are processing a snapshot and are catching up. 538 if !mset.isCatchingUp() { 539 return true 540 } 541 } else { // node != nil 542 if node != mset.raftNode() { 543 s.Warnf("Detected stream cluster node skew '%s > %s'", acc.GetName(), streamName) 544 node.Delete() 545 mset.resetClusteredState(nil) 546 } else if node.State() == Closed { 547 js.restartStream(acc, sa) 548 } 549 } 550 551 return false 552 } 553 554 // isConsumerHealthy will determine if the consumer is up to date. 555 // For R1 it will make sure the consunmer is present on this server. 556 func (js *jetStream) isConsumerHealthy(mset *stream, consumer string, ca *consumerAssignment) bool { 557 if mset == nil { 558 return false 559 } 560 561 js.mu.RLock() 562 cc := js.cluster 563 if cc == nil { 564 // Non-clustered mode 565 js.mu.RUnlock() 566 return true 567 } 568 // These are required. 569 if ca == nil || ca.Group == nil { 570 js.mu.RUnlock() 571 return false 572 } 573 s := js.srv 574 js.mu.RUnlock() 575 576 // Capture RAFT node from assignment. 577 node := ca.Group.node 578 579 // When we try to restart we nil out the node if applicable 580 // and reprocess the consumer assignment. 581 restartConsumer := func() { 582 mset.mu.RLock() 583 accName, streamName := mset.acc.GetName(), mset.cfg.Name 584 mset.mu.RUnlock() 585 586 js.mu.Lock() 587 deleted := ca.deleted 588 // Check that we have not just been created. 589 if !deleted && time.Since(ca.Created) < 10*time.Second { 590 s.Debugf("Not restarting missing consumer '%s > %s > %s', too soon since creation %v", 591 accName, streamName, consumer, time.Since(ca.Created)) 592 js.mu.Unlock() 593 return 594 } 595 // Make sure the node is stopped if still running. 596 if node != nil && node.State() != Closed { 597 node.Stop() 598 } 599 ca.Group.node = nil 600 js.mu.Unlock() 601 if !deleted { 602 js.processConsumerAssignment(ca) 603 } 604 } 605 606 // Check if not running at all. 607 o := mset.lookupConsumer(consumer) 608 if o == nil { 609 restartConsumer() 610 return false 611 } 612 613 // Check RAFT node state. 614 if node == nil || node.Healthy() { 615 return true 616 } else if node != nil { 617 if node != o.raftNode() { 618 mset.mu.RLock() 619 accName, streamName := mset.acc.GetName(), mset.cfg.Name 620 mset.mu.RUnlock() 621 s.Warnf("Detected consumer cluster node skew '%s > %s > %s'", accName, streamName, consumer) 622 node.Delete() 623 o.deleteWithoutAdvisory() 624 restartConsumer() 625 } else if node.State() == Closed { 626 // We have a consumer, and it should have a running node but it is closed. 627 o.stop() 628 restartConsumer() 629 } 630 } 631 return false 632 } 633 634 // subjectsOverlap checks all existing stream assignments for the account cross-cluster for subject overlap 635 // Use only for clustered JetStream 636 // Read lock should be held. 637 func (jsc *jetStreamCluster) subjectsOverlap(acc string, subjects []string, osa *streamAssignment) bool { 638 asa := jsc.streams[acc] 639 for _, sa := range asa { 640 // can't overlap yourself, assume osa pre-checked for deep equal if passed 641 if osa != nil && sa == osa { 642 continue 643 } 644 for _, subj := range sa.Config.Subjects { 645 for _, tsubj := range subjects { 646 if SubjectsCollide(tsubj, subj) { 647 return true 648 } 649 } 650 } 651 } 652 return false 653 } 654 655 func (a *Account) getJetStreamFromAccount() (*Server, *jetStream, *jsAccount) { 656 a.mu.RLock() 657 jsa := a.js 658 a.mu.RUnlock() 659 if jsa == nil { 660 return nil, nil, nil 661 } 662 jsa.mu.RLock() 663 js := jsa.js 664 jsa.mu.RUnlock() 665 if js == nil { 666 return nil, nil, nil 667 } 668 // Lock not needed, set on creation. 669 s := js.srv 670 return s, js, jsa 671 } 672 673 func (s *Server) JetStreamIsStreamLeader(account, stream string) bool { 674 js, cc := s.getJetStreamCluster() 675 if js == nil || cc == nil { 676 return false 677 } 678 js.mu.RLock() 679 defer js.mu.RUnlock() 680 return cc.isStreamLeader(account, stream) 681 } 682 683 func (a *Account) JetStreamIsStreamLeader(stream string) bool { 684 s, js, jsa := a.getJetStreamFromAccount() 685 if s == nil || js == nil || jsa == nil { 686 return false 687 } 688 js.mu.RLock() 689 defer js.mu.RUnlock() 690 return js.cluster.isStreamLeader(a.Name, stream) 691 } 692 693 func (s *Server) JetStreamIsStreamCurrent(account, stream string) bool { 694 js, cc := s.getJetStreamCluster() 695 if js == nil { 696 return false 697 } 698 js.mu.RLock() 699 defer js.mu.RUnlock() 700 return cc.isStreamCurrent(account, stream) 701 } 702 703 func (a *Account) JetStreamIsConsumerLeader(stream, consumer string) bool { 704 s, js, jsa := a.getJetStreamFromAccount() 705 if s == nil || js == nil || jsa == nil { 706 return false 707 } 708 js.mu.RLock() 709 defer js.mu.RUnlock() 710 return js.cluster.isConsumerLeader(a.Name, stream, consumer) 711 } 712 713 func (s *Server) JetStreamIsConsumerLeader(account, stream, consumer string) bool { 714 js, cc := s.getJetStreamCluster() 715 if js == nil || cc == nil { 716 return false 717 } 718 js.mu.RLock() 719 defer js.mu.RUnlock() 720 return cc.isConsumerLeader(account, stream, consumer) 721 } 722 723 func (s *Server) enableJetStreamClustering() error { 724 if !s.isRunning() { 725 return nil 726 } 727 js := s.getJetStream() 728 if js == nil { 729 return NewJSNotEnabledForAccountError() 730 } 731 // Already set. 732 if js.cluster != nil { 733 return nil 734 } 735 736 s.Noticef("Starting JetStream cluster") 737 // We need to determine if we have a stable cluster name and expected number of servers. 738 s.Debugf("JetStream cluster checking for stable cluster name and peers") 739 740 hasLeafNodeSystemShare := s.canExtendOtherDomain() 741 if s.isClusterNameDynamic() && !hasLeafNodeSystemShare { 742 return errors.New("JetStream cluster requires cluster name") 743 } 744 if s.configuredRoutes() == 0 && !hasLeafNodeSystemShare { 745 return errors.New("JetStream cluster requires configured routes or solicited leafnode for the system account") 746 } 747 748 return js.setupMetaGroup() 749 } 750 751 // isClustered returns if we are clustered. 752 // Lock should not be held. 753 func (js *jetStream) isClustered() bool { 754 // This is only ever set, no need for lock here. 755 return js.cluster != nil 756 } 757 758 // isClusteredNoLock returns if we are clustered, but unlike isClustered() does 759 // not use the jetstream's lock, instead, uses an atomic operation. 760 // There are situations where some code wants to know if we are clustered but 761 // can't use js.isClustered() without causing a lock inversion. 762 func (js *jetStream) isClusteredNoLock() bool { 763 return atomic.LoadInt32(&js.clustered) == 1 764 } 765 766 func (js *jetStream) setupMetaGroup() error { 767 s := js.srv 768 s.Noticef("Creating JetStream metadata controller") 769 770 // Setup our WAL for the metagroup. 771 sysAcc := s.SystemAccount() 772 storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, defaultMetaGroupName) 773 774 fs, err := newFileStoreWithCreated( 775 FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMetaFSBlkSize, AsyncFlush: false, srv: s}, 776 StreamConfig{Name: defaultMetaGroupName, Storage: FileStorage}, 777 time.Now().UTC(), 778 s.jsKeyGen(s.getOpts().JetStreamKey, defaultMetaGroupName), 779 s.jsKeyGen(s.getOpts().JetStreamOldKey, defaultMetaGroupName), 780 ) 781 if err != nil { 782 s.Errorf("Error creating filestore: %v", err) 783 return err 784 } 785 786 cfg := &RaftConfig{Name: defaultMetaGroupName, Store: storeDir, Log: fs} 787 788 // If we are soliciting leafnode connections and we are sharing a system account and do not disable it with a hint, 789 // we want to move to observer mode so that we extend the solicited cluster or supercluster but do not form our own. 790 cfg.Observer = s.canExtendOtherDomain() && s.getOpts().JetStreamExtHint != jsNoExtend 791 792 var bootstrap bool 793 if ps, err := readPeerState(storeDir); err != nil { 794 s.Noticef("JetStream cluster bootstrapping") 795 bootstrap = true 796 peers := s.ActivePeers() 797 s.Debugf("JetStream cluster initial peers: %+v", peers) 798 if err := s.bootstrapRaftNode(cfg, peers, false); err != nil { 799 return err 800 } 801 if cfg.Observer { 802 s.Noticef("Turning JetStream metadata controller Observer Mode on") 803 } 804 } else { 805 s.Noticef("JetStream cluster recovering state") 806 // correlate the value of observer with observations from a previous run. 807 if cfg.Observer { 808 switch ps.domainExt { 809 case extExtended: 810 s.Noticef("Keeping JetStream metadata controller Observer Mode on - due to previous contact") 811 case extNotExtended: 812 s.Noticef("Turning JetStream metadata controller Observer Mode off - due to previous contact") 813 cfg.Observer = false 814 case extUndetermined: 815 s.Noticef("Turning JetStream metadata controller Observer Mode on - no previous contact") 816 s.Noticef("In cases where JetStream will not be extended") 817 s.Noticef("and waiting for leader election until first contact is not acceptable,") 818 s.Noticef(`manually disable Observer Mode by setting the JetStream Option "extension_hint: %s"`, jsNoExtend) 819 } 820 } else { 821 // To track possible configuration changes, responsible for an altered value of cfg.Observer, 822 // set extension state to undetermined. 823 ps.domainExt = extUndetermined 824 if err := writePeerState(storeDir, ps); err != nil { 825 return err 826 } 827 } 828 } 829 830 // Start up our meta node. 831 n, err := s.startRaftNode(sysAcc.GetName(), cfg, pprofLabels{ 832 "type": "metaleader", 833 "account": sysAcc.Name, 834 }) 835 if err != nil { 836 s.Warnf("Could not start metadata controller: %v", err) 837 return err 838 } 839 840 // If we are bootstrapped with no state, start campaign early. 841 if bootstrap { 842 n.Campaign() 843 } 844 845 c := s.createInternalJetStreamClient() 846 sacc := s.SystemAccount() 847 848 js.mu.Lock() 849 defer js.mu.Unlock() 850 js.cluster = &jetStreamCluster{ 851 meta: n, 852 streams: make(map[string]map[string]*streamAssignment), 853 s: s, 854 c: c, 855 qch: make(chan struct{}), 856 } 857 atomic.StoreInt32(&js.clustered, 1) 858 c.registerWithAccount(sacc) 859 860 js.srv.startGoRoutine( 861 js.monitorCluster, 862 pprofLabels{ 863 "type": "metaleader", 864 "account": sacc.Name, 865 }, 866 ) 867 return nil 868 } 869 870 func (js *jetStream) getMetaGroup() RaftNode { 871 js.mu.RLock() 872 defer js.mu.RUnlock() 873 if js.cluster == nil { 874 return nil 875 } 876 return js.cluster.meta 877 } 878 879 func (js *jetStream) server() *Server { 880 // Lock not needed, only set once on creation. 881 return js.srv 882 } 883 884 // Will respond if we do not think we have a metacontroller leader. 885 func (js *jetStream) isLeaderless() bool { 886 js.mu.RLock() 887 defer js.mu.RUnlock() 888 889 cc := js.cluster 890 if cc == nil || cc.meta == nil { 891 return false 892 } 893 // If we don't have a leader. 894 // Make sure we have been running for enough time. 895 if cc.meta.GroupLeader() == _EMPTY_ && time.Since(cc.meta.Created()) > lostQuorumIntervalDefault { 896 return true 897 } 898 return false 899 } 900 901 // Will respond iff we are a member and we know we have no leader. 902 func (js *jetStream) isGroupLeaderless(rg *raftGroup) bool { 903 if rg == nil || js == nil { 904 return false 905 } 906 js.mu.RLock() 907 defer js.mu.RUnlock() 908 909 cc := js.cluster 910 911 // If we are not a member we can not say.. 912 if cc.meta == nil { 913 return false 914 } 915 if !rg.isMember(cc.meta.ID()) { 916 return false 917 } 918 // Single peer groups always have a leader if we are here. 919 if rg.node == nil { 920 return false 921 } 922 // If we don't have a leader. 923 if rg.node.GroupLeader() == _EMPTY_ { 924 // Threshold for jetstream startup. 925 const startupThreshold = 10 * time.Second 926 927 if rg.node.HadPreviousLeader() { 928 // Make sure we have been running long enough to intelligently determine this. 929 if time.Since(js.started) > startupThreshold { 930 return true 931 } 932 } 933 // Make sure we have been running for enough time. 934 if time.Since(rg.node.Created()) > lostQuorumIntervalDefault { 935 return true 936 } 937 } 938 939 return false 940 } 941 942 func (s *Server) JetStreamIsStreamAssigned(account, stream string) bool { 943 js, cc := s.getJetStreamCluster() 944 if js == nil || cc == nil { 945 return false 946 } 947 acc, _ := s.LookupAccount(account) 948 if acc == nil { 949 return false 950 } 951 js.mu.RLock() 952 assigned := cc.isStreamAssigned(acc, stream) 953 js.mu.RUnlock() 954 return assigned 955 } 956 957 // streamAssigned informs us if this server has this stream assigned. 958 func (jsa *jsAccount) streamAssigned(stream string) bool { 959 jsa.mu.RLock() 960 js, acc := jsa.js, jsa.account 961 jsa.mu.RUnlock() 962 963 if js == nil { 964 return false 965 } 966 js.mu.RLock() 967 assigned := js.cluster.isStreamAssigned(acc, stream) 968 js.mu.RUnlock() 969 return assigned 970 } 971 972 // Read lock should be held. 973 func (cc *jetStreamCluster) isStreamAssigned(a *Account, stream string) bool { 974 // Non-clustered mode always return true. 975 if cc == nil { 976 return true 977 } 978 if cc.meta == nil { 979 return false 980 } 981 as := cc.streams[a.Name] 982 if as == nil { 983 return false 984 } 985 sa := as[stream] 986 if sa == nil { 987 return false 988 } 989 rg := sa.Group 990 if rg == nil { 991 return false 992 } 993 // Check if we are the leader of this raftGroup assigned to the stream. 994 ourID := cc.meta.ID() 995 for _, peer := range rg.Peers { 996 if peer == ourID { 997 return true 998 } 999 } 1000 return false 1001 } 1002 1003 // Read lock should be held. 1004 func (cc *jetStreamCluster) isStreamLeader(account, stream string) bool { 1005 // Non-clustered mode always return true. 1006 if cc == nil { 1007 return true 1008 } 1009 if cc.meta == nil { 1010 return false 1011 } 1012 1013 var sa *streamAssignment 1014 if as := cc.streams[account]; as != nil { 1015 sa = as[stream] 1016 } 1017 if sa == nil { 1018 return false 1019 } 1020 rg := sa.Group 1021 if rg == nil { 1022 return false 1023 } 1024 // Check if we are the leader of this raftGroup assigned to the stream. 1025 ourID := cc.meta.ID() 1026 for _, peer := range rg.Peers { 1027 if peer == ourID { 1028 if len(rg.Peers) == 1 || rg.node != nil && rg.node.Leader() { 1029 return true 1030 } 1031 } 1032 } 1033 return false 1034 } 1035 1036 // Read lock should be held. 1037 func (cc *jetStreamCluster) isConsumerLeader(account, stream, consumer string) bool { 1038 // Non-clustered mode always return true. 1039 if cc == nil { 1040 return true 1041 } 1042 if cc.meta == nil { 1043 return false 1044 } 1045 1046 var sa *streamAssignment 1047 if as := cc.streams[account]; as != nil { 1048 sa = as[stream] 1049 } 1050 if sa == nil { 1051 return false 1052 } 1053 // Check if we are the leader of this raftGroup assigned to this consumer. 1054 ca := sa.consumers[consumer] 1055 if ca == nil { 1056 return false 1057 } 1058 rg := ca.Group 1059 ourID := cc.meta.ID() 1060 for _, peer := range rg.Peers { 1061 if peer == ourID { 1062 if len(rg.Peers) == 1 || (rg.node != nil && rg.node.Leader()) { 1063 return true 1064 } 1065 } 1066 } 1067 return false 1068 } 1069 1070 // Remove the stream `streamName` for the account `accName` from the inflight 1071 // proposals map. This is done on success (processStreamAssignment) or on 1072 // failure (processStreamAssignmentResults). 1073 // (Write) Lock held on entry. 1074 func (cc *jetStreamCluster) removeInflightProposal(accName, streamName string) { 1075 streams, ok := cc.inflight[accName] 1076 if !ok { 1077 return 1078 } 1079 delete(streams, streamName) 1080 if len(streams) == 0 { 1081 delete(cc.inflight, accName) 1082 } 1083 } 1084 1085 // Return the cluster quit chan. 1086 func (js *jetStream) clusterQuitC() chan struct{} { 1087 js.mu.RLock() 1088 defer js.mu.RUnlock() 1089 if js.cluster != nil { 1090 return js.cluster.qch 1091 } 1092 return nil 1093 } 1094 1095 // Mark that the meta layer is recovering. 1096 func (js *jetStream) setMetaRecovering() { 1097 js.mu.Lock() 1098 defer js.mu.Unlock() 1099 if js.cluster != nil { 1100 // metaRecovering 1101 js.metaRecovering = true 1102 } 1103 } 1104 1105 // Mark that the meta layer is no longer recovering. 1106 func (js *jetStream) clearMetaRecovering() { 1107 js.mu.Lock() 1108 defer js.mu.Unlock() 1109 js.metaRecovering = false 1110 } 1111 1112 // Return whether the meta layer is recovering. 1113 func (js *jetStream) isMetaRecovering() bool { 1114 js.mu.RLock() 1115 defer js.mu.RUnlock() 1116 return js.metaRecovering 1117 } 1118 1119 // During recovery track any stream and consumer delete and update operations. 1120 type recoveryUpdates struct { 1121 removeStreams map[string]*streamAssignment 1122 removeConsumers map[string]*consumerAssignment 1123 updateStreams map[string]*streamAssignment 1124 updateConsumers map[string]*consumerAssignment 1125 } 1126 1127 // Called after recovery of the cluster on startup to check for any orphans. 1128 // Streams and consumers are recovered from disk, and the meta layer's mappings 1129 // should clean them up, but under crash scenarios there could be orphans. 1130 func (js *jetStream) checkForOrphans() { 1131 consumerName := func(o *consumer) string { 1132 o.mu.RLock() 1133 defer o.mu.RUnlock() 1134 return o.name 1135 } 1136 1137 // Can not hold jetstream lock while trying to delete streams or consumers. 1138 js.mu.Lock() 1139 s, cc := js.srv, js.cluster 1140 s.Debugf("JetStream cluster checking for orphans") 1141 1142 var streams []*stream 1143 var consumers []*consumer 1144 1145 for accName, jsa := range js.accounts { 1146 asa := cc.streams[accName] 1147 jsa.mu.RLock() 1148 for stream, mset := range jsa.streams { 1149 if sa := asa[stream]; sa == nil { 1150 streams = append(streams, mset) 1151 } else { 1152 // This one is good, check consumers now. 1153 for _, o := range mset.getConsumers() { 1154 consumer := consumerName(o) 1155 if sa.consumers[consumer] == nil { 1156 consumers = append(consumers, o) 1157 } 1158 } 1159 } 1160 } 1161 jsa.mu.RUnlock() 1162 } 1163 js.mu.Unlock() 1164 1165 for _, mset := range streams { 1166 mset.mu.RLock() 1167 accName, stream := mset.acc.Name, mset.cfg.Name 1168 mset.mu.RUnlock() 1169 s.Warnf("Detected orphaned stream '%s > %s', will cleanup", accName, stream) 1170 if err := mset.delete(); err != nil { 1171 s.Warnf("Deleting stream encountered an error: %v", err) 1172 } 1173 } 1174 for _, o := range consumers { 1175 o.mu.RLock() 1176 accName, mset, consumer := o.acc.Name, o.mset, o.name 1177 o.mu.RUnlock() 1178 stream := "N/A" 1179 if mset != nil { 1180 mset.mu.RLock() 1181 stream = mset.cfg.Name 1182 mset.mu.RUnlock() 1183 } 1184 s.Warnf("Detected orphaned consumer '%s > %s > %s', will cleanup", accName, stream, consumer) 1185 if err := o.delete(); err != nil { 1186 s.Warnf("Deleting consumer encountered an error: %v", err) 1187 } 1188 } 1189 } 1190 1191 // Check and delete any orphans we may come across. 1192 func (s *Server) checkForNRGOrphans() { 1193 js, cc := s.getJetStreamCluster() 1194 if js == nil || cc == nil || js.isMetaRecovering() { 1195 // No cluster means no NRGs. Also return if still recovering. 1196 return 1197 } 1198 1199 // Track which assets R>1 should be on this server. 1200 nrgMap := make(map[string]struct{}) 1201 trackGroup := func(rg *raftGroup) { 1202 // If R>1 track this as a legit NRG. 1203 if rg.node != nil { 1204 nrgMap[rg.Name] = struct{}{} 1205 } 1206 } 1207 // Register our meta. 1208 js.mu.RLock() 1209 meta := cc.meta 1210 if meta == nil { 1211 js.mu.RUnlock() 1212 // Bail with no meta node. 1213 return 1214 } 1215 1216 ourID := meta.ID() 1217 nrgMap[meta.Group()] = struct{}{} 1218 1219 // Collect all valid groups from our assignments. 1220 for _, asa := range cc.streams { 1221 for _, sa := range asa { 1222 if sa.Group.isMember(ourID) && sa.Restore == nil { 1223 trackGroup(sa.Group) 1224 for _, ca := range sa.consumers { 1225 if ca.Group.isMember(ourID) { 1226 trackGroup(ca.Group) 1227 } 1228 } 1229 } 1230 } 1231 } 1232 js.mu.RUnlock() 1233 1234 // Check NRGs that are running. 1235 var needDelete []RaftNode 1236 s.rnMu.RLock() 1237 for name, n := range s.raftNodes { 1238 if _, ok := nrgMap[name]; !ok { 1239 needDelete = append(needDelete, n) 1240 } 1241 } 1242 s.rnMu.RUnlock() 1243 1244 for _, n := range needDelete { 1245 s.Warnf("Detected orphaned NRG %q, will cleanup", n.Group()) 1246 n.Delete() 1247 } 1248 } 1249 1250 func (js *jetStream) monitorCluster() { 1251 s, n := js.server(), js.getMetaGroup() 1252 qch, rqch, lch, aq := js.clusterQuitC(), n.QuitC(), n.LeadChangeC(), n.ApplyQ() 1253 1254 defer s.grWG.Done() 1255 1256 s.Debugf("Starting metadata monitor") 1257 defer s.Debugf("Exiting metadata monitor") 1258 1259 // Make sure to stop the raft group on exit to prevent accidental memory bloat. 1260 defer n.Stop() 1261 defer s.isMetaLeader.Store(false) 1262 1263 const compactInterval = time.Minute 1264 t := time.NewTicker(compactInterval) 1265 defer t.Stop() 1266 1267 // Used to check cold boot cluster when possibly in mixed mode. 1268 const leaderCheckInterval = time.Second 1269 lt := time.NewTicker(leaderCheckInterval) 1270 defer lt.Stop() 1271 1272 // Check the general health once an hour. 1273 const healthCheckInterval = 1 * time.Hour 1274 ht := time.NewTicker(healthCheckInterval) 1275 defer ht.Stop() 1276 1277 // Utility to check health. 1278 checkHealth := func() { 1279 if hs := s.healthz(nil); hs.Error != _EMPTY_ { 1280 s.Warnf("%v", hs.Error) 1281 } 1282 // Also check for orphaned NRGs. 1283 s.checkForNRGOrphans() 1284 } 1285 1286 var ( 1287 isLeader bool 1288 lastSnapTime time.Time 1289 compactSizeMin = uint64(8 * 1024 * 1024) // 8MB 1290 minSnapDelta = 10 * time.Second 1291 ) 1292 1293 // Highwayhash key for generating hashes. 1294 key := make([]byte, 32) 1295 crand.Read(key) 1296 1297 // Set to true to start. 1298 js.setMetaRecovering() 1299 1300 // Snapshotting function. 1301 doSnapshot := func() { 1302 // Suppress during recovery. 1303 if js.isMetaRecovering() { 1304 return 1305 } 1306 // For the meta layer we want to snapshot when asked if we need one or have any entries that we can compact. 1307 if ne, _ := n.Size(); ne > 0 || n.NeedSnapshot() { 1308 if err := n.InstallSnapshot(js.metaSnapshot()); err == nil { 1309 lastSnapTime = time.Now() 1310 } else if err != errNoSnapAvailable && err != errNodeClosed { 1311 s.Warnf("Error snapshotting JetStream cluster state: %v", err) 1312 } 1313 } 1314 } 1315 1316 ru := &recoveryUpdates{ 1317 removeStreams: make(map[string]*streamAssignment), 1318 removeConsumers: make(map[string]*consumerAssignment), 1319 updateStreams: make(map[string]*streamAssignment), 1320 updateConsumers: make(map[string]*consumerAssignment), 1321 } 1322 1323 for { 1324 select { 1325 case <-s.quitCh: 1326 return 1327 case <-rqch: 1328 return 1329 case <-qch: 1330 // Clean signal from shutdown routine so do best effort attempt to snapshot meta layer. 1331 doSnapshot() 1332 // Return the signal back since shutdown will be waiting. 1333 close(qch) 1334 return 1335 case <-aq.ch: 1336 ces := aq.pop() 1337 for _, ce := range ces { 1338 if ce == nil { 1339 // Signals we have replayed all of our metadata. 1340 js.clearMetaRecovering() 1341 // Process any removes that are still valid after recovery. 1342 for _, ca := range ru.removeConsumers { 1343 js.processConsumerRemoval(ca) 1344 } 1345 for _, sa := range ru.removeStreams { 1346 js.processStreamRemoval(sa) 1347 } 1348 // Process pending updates. 1349 for _, sa := range ru.updateStreams { 1350 js.processUpdateStreamAssignment(sa) 1351 } 1352 // Now consumers. 1353 for _, ca := range ru.updateConsumers { 1354 js.processConsumerAssignment(ca) 1355 } 1356 // Clear. 1357 ru = nil 1358 s.Debugf("Recovered JetStream cluster metadata") 1359 js.checkForOrphans() 1360 // Do a health check here as well. 1361 go checkHealth() 1362 continue 1363 } 1364 if didSnap, didStreamRemoval, didConsumerRemoval, err := js.applyMetaEntries(ce.Entries, ru); err == nil { 1365 _, nb := n.Applied(ce.Index) 1366 if js.hasPeerEntries(ce.Entries) || didStreamRemoval || (didSnap && !isLeader) { 1367 doSnapshot() 1368 } else if didConsumerRemoval && time.Since(lastSnapTime) > minSnapDelta/2 { 1369 doSnapshot() 1370 } else if nb > compactSizeMin && time.Since(lastSnapTime) > minSnapDelta { 1371 doSnapshot() 1372 } 1373 ce.ReturnToPool() 1374 } else { 1375 s.Warnf("Error applying JetStream cluster entries: %v", err) 1376 } 1377 } 1378 aq.recycle(&ces) 1379 1380 case isLeader = <-lch: 1381 // For meta layer synchronize everyone to our state on becoming leader. 1382 if isLeader && n.ApplyQ().len() == 0 { 1383 n.SendSnapshot(js.metaSnapshot()) 1384 } 1385 // Process the change. 1386 js.processLeaderChange(isLeader) 1387 if isLeader { 1388 s.sendInternalMsgLocked(serverStatsPingReqSubj, _EMPTY_, nil, nil) 1389 // Install a snapshot as we become leader. 1390 js.checkClusterSize() 1391 doSnapshot() 1392 } 1393 1394 case <-t.C: 1395 doSnapshot() 1396 // Periodically check the cluster size. 1397 if n.Leader() { 1398 js.checkClusterSize() 1399 } 1400 case <-ht.C: 1401 // Do this in a separate go routine. 1402 go checkHealth() 1403 1404 case <-lt.C: 1405 s.Debugf("Checking JetStream cluster state") 1406 // If we have a current leader or had one in the past we can cancel this here since the metaleader 1407 // will be in charge of all peer state changes. 1408 // For cold boot only. 1409 if n.GroupLeader() != _EMPTY_ || n.HadPreviousLeader() { 1410 lt.Stop() 1411 continue 1412 } 1413 // If we are here we do not have a leader and we did not have a previous one, so cold start. 1414 // Check to see if we can adjust our cluster size down iff we are in mixed mode and we have 1415 // seen a total that is what our original estimate was. 1416 cs := n.ClusterSize() 1417 if js, total := s.trackedJetStreamServers(); js < total && total >= cs && js != cs { 1418 s.Noticef("Adjusting JetStream expected peer set size to %d from original %d", js, cs) 1419 n.AdjustBootClusterSize(js) 1420 } 1421 } 1422 } 1423 } 1424 1425 // This is called on first leader transition to double check the peers and cluster set size. 1426 func (js *jetStream) checkClusterSize() { 1427 s, n := js.server(), js.getMetaGroup() 1428 if n == nil { 1429 return 1430 } 1431 // We will check that we have a correct cluster set size by checking for any non-js servers 1432 // which can happen in mixed mode. 1433 ps := n.(*raft).currentPeerState() 1434 if len(ps.knownPeers) >= ps.clusterSize { 1435 return 1436 } 1437 1438 // Grab our active peers. 1439 peers := s.ActivePeers() 1440 1441 // If we have not registered all of our peers yet we can't do 1442 // any adjustments based on a mixed mode. We will periodically check back. 1443 if len(peers) < ps.clusterSize { 1444 return 1445 } 1446 1447 s.Debugf("Checking JetStream cluster size") 1448 1449 // If we are here our known set as the leader is not the same as the cluster size. 1450 // Check to see if we have a mixed mode setup. 1451 var totalJS int 1452 for _, p := range peers { 1453 if si, ok := s.nodeToInfo.Load(p); ok && si != nil { 1454 if si.(nodeInfo).js { 1455 totalJS++ 1456 } 1457 } 1458 } 1459 // If we have less then our cluster size adjust that here. Can not do individual peer removals since 1460 // they will not be in the tracked peers. 1461 if totalJS < ps.clusterSize { 1462 s.Debugf("Adjusting JetStream cluster size from %d to %d", ps.clusterSize, totalJS) 1463 if err := n.AdjustClusterSize(totalJS); err != nil { 1464 s.Warnf("Error adjusting JetStream cluster size: %v", err) 1465 } 1466 } 1467 } 1468 1469 // Represents our stable meta state that we can write out. 1470 type writeableStreamAssignment struct { 1471 Client *ClientInfo `json:"client,omitempty"` 1472 Created time.Time `json:"created"` 1473 Config *StreamConfig `json:"stream"` 1474 Group *raftGroup `json:"group"` 1475 Sync string `json:"sync"` 1476 Consumers []*consumerAssignment 1477 } 1478 1479 func (js *jetStream) clusterStreamConfig(accName, streamName string) (StreamConfig, bool) { 1480 js.mu.RLock() 1481 defer js.mu.RUnlock() 1482 if sa, ok := js.cluster.streams[accName][streamName]; ok { 1483 return *sa.Config, true 1484 } 1485 return StreamConfig{}, false 1486 } 1487 1488 func (js *jetStream) metaSnapshot() []byte { 1489 js.mu.RLock() 1490 cc := js.cluster 1491 nsa := 0 1492 for _, asa := range cc.streams { 1493 nsa += len(asa) 1494 } 1495 streams := make([]writeableStreamAssignment, 0, nsa) 1496 for _, asa := range cc.streams { 1497 for _, sa := range asa { 1498 wsa := writeableStreamAssignment{ 1499 Client: sa.Client, 1500 Created: sa.Created, 1501 Config: sa.Config, 1502 Group: sa.Group, 1503 Sync: sa.Sync, 1504 Consumers: make([]*consumerAssignment, 0, len(sa.consumers)), 1505 } 1506 for _, ca := range sa.consumers { 1507 wsa.Consumers = append(wsa.Consumers, ca) 1508 } 1509 streams = append(streams, wsa) 1510 } 1511 } 1512 1513 if len(streams) == 0 { 1514 js.mu.RUnlock() 1515 return nil 1516 } 1517 1518 b, _ := json.Marshal(streams) 1519 js.mu.RUnlock() 1520 1521 return s2.EncodeBetter(nil, b) 1522 } 1523 1524 func (js *jetStream) applyMetaSnapshot(buf []byte, ru *recoveryUpdates, isRecovering bool) error { 1525 var wsas []writeableStreamAssignment 1526 if len(buf) > 0 { 1527 jse, err := s2.Decode(nil, buf) 1528 if err != nil { 1529 return err 1530 } 1531 if err = json.Unmarshal(jse, &wsas); err != nil { 1532 return err 1533 } 1534 } 1535 1536 // Build our new version here outside of js. 1537 streams := make(map[string]map[string]*streamAssignment) 1538 for _, wsa := range wsas { 1539 fixCfgMirrorWithDedupWindow(wsa.Config) 1540 as := streams[wsa.Client.serviceAccount()] 1541 if as == nil { 1542 as = make(map[string]*streamAssignment) 1543 streams[wsa.Client.serviceAccount()] = as 1544 } 1545 sa := &streamAssignment{Client: wsa.Client, Created: wsa.Created, Config: wsa.Config, Group: wsa.Group, Sync: wsa.Sync} 1546 if len(wsa.Consumers) > 0 { 1547 sa.consumers = make(map[string]*consumerAssignment) 1548 for _, ca := range wsa.Consumers { 1549 sa.consumers[ca.Name] = ca 1550 } 1551 } 1552 as[wsa.Config.Name] = sa 1553 } 1554 1555 js.mu.Lock() 1556 cc := js.cluster 1557 1558 var saAdd, saDel, saChk []*streamAssignment 1559 // Walk through the old list to generate the delete list. 1560 for account, asa := range cc.streams { 1561 nasa := streams[account] 1562 for sn, sa := range asa { 1563 if nsa := nasa[sn]; nsa == nil { 1564 saDel = append(saDel, sa) 1565 } else { 1566 saChk = append(saChk, nsa) 1567 } 1568 } 1569 } 1570 // Walk through the new list to generate the add list. 1571 for account, nasa := range streams { 1572 asa := cc.streams[account] 1573 for sn, sa := range nasa { 1574 if asa[sn] == nil { 1575 saAdd = append(saAdd, sa) 1576 } 1577 } 1578 } 1579 // Now walk the ones to check and process consumers. 1580 var caAdd, caDel []*consumerAssignment 1581 for _, sa := range saChk { 1582 // Make sure to add in all the new ones from sa. 1583 for _, ca := range sa.consumers { 1584 caAdd = append(caAdd, ca) 1585 } 1586 if osa := js.streamAssignment(sa.Client.serviceAccount(), sa.Config.Name); osa != nil { 1587 for _, ca := range osa.consumers { 1588 if sa.consumers[ca.Name] == nil { 1589 caDel = append(caDel, ca) 1590 } else { 1591 caAdd = append(caAdd, ca) 1592 } 1593 } 1594 } 1595 } 1596 js.mu.Unlock() 1597 1598 // Do removals first. 1599 for _, sa := range saDel { 1600 js.setStreamAssignmentRecovering(sa) 1601 if isRecovering { 1602 key := sa.recoveryKey() 1603 ru.removeStreams[key] = sa 1604 delete(ru.updateStreams, key) 1605 } else { 1606 js.processStreamRemoval(sa) 1607 } 1608 } 1609 // Now do add for the streams. Also add in all consumers. 1610 for _, sa := range saAdd { 1611 js.setStreamAssignmentRecovering(sa) 1612 js.processStreamAssignment(sa) 1613 1614 // We can simply process the consumers. 1615 for _, ca := range sa.consumers { 1616 js.setConsumerAssignmentRecovering(ca) 1617 js.processConsumerAssignment(ca) 1618 } 1619 } 1620 1621 // Perform updates on those in saChk. These were existing so make 1622 // sure to process any changes. 1623 for _, sa := range saChk { 1624 js.setStreamAssignmentRecovering(sa) 1625 if isRecovering { 1626 key := sa.recoveryKey() 1627 ru.updateStreams[key] = sa 1628 delete(ru.removeStreams, key) 1629 } else { 1630 js.processUpdateStreamAssignment(sa) 1631 } 1632 } 1633 1634 // Now do the deltas for existing stream's consumers. 1635 for _, ca := range caDel { 1636 js.setConsumerAssignmentRecovering(ca) 1637 if isRecovering { 1638 key := ca.recoveryKey() 1639 ru.removeConsumers[key] = ca 1640 delete(ru.updateConsumers, key) 1641 } else { 1642 js.processConsumerRemoval(ca) 1643 } 1644 } 1645 for _, ca := range caAdd { 1646 js.setConsumerAssignmentRecovering(ca) 1647 if isRecovering { 1648 key := ca.recoveryKey() 1649 delete(ru.removeConsumers, key) 1650 ru.updateConsumers[key] = ca 1651 } else { 1652 js.processConsumerAssignment(ca) 1653 } 1654 } 1655 1656 return nil 1657 } 1658 1659 // Called on recovery to make sure we do not process like original. 1660 func (js *jetStream) setStreamAssignmentRecovering(sa *streamAssignment) { 1661 js.mu.Lock() 1662 defer js.mu.Unlock() 1663 sa.responded = true 1664 sa.recovering = true 1665 sa.Restore = nil 1666 if sa.Group != nil { 1667 sa.Group.Preferred = _EMPTY_ 1668 } 1669 } 1670 1671 // Called on recovery to make sure we do not process like original. 1672 func (js *jetStream) setConsumerAssignmentRecovering(ca *consumerAssignment) { 1673 js.mu.Lock() 1674 defer js.mu.Unlock() 1675 ca.responded = true 1676 ca.recovering = true 1677 if ca.Group != nil { 1678 ca.Group.Preferred = _EMPTY_ 1679 } 1680 } 1681 1682 // Just copies over and changes out the group so it can be encoded. 1683 // Lock should be held. 1684 func (sa *streamAssignment) copyGroup() *streamAssignment { 1685 csa, cg := *sa, *sa.Group 1686 csa.Group = &cg 1687 csa.Group.Peers = copyStrings(sa.Group.Peers) 1688 return &csa 1689 } 1690 1691 // Just copies over and changes out the group so it can be encoded. 1692 // Lock should be held. 1693 func (ca *consumerAssignment) copyGroup() *consumerAssignment { 1694 cca, cg := *ca, *ca.Group 1695 cca.Group = &cg 1696 cca.Group.Peers = copyStrings(ca.Group.Peers) 1697 return &cca 1698 } 1699 1700 // Lock should be held. 1701 func (sa *streamAssignment) missingPeers() bool { 1702 return len(sa.Group.Peers) < sa.Config.Replicas 1703 } 1704 1705 // Called when we detect a new peer. Only the leader will process checking 1706 // for any streams, and consequently any consumers. 1707 func (js *jetStream) processAddPeer(peer string) { 1708 js.mu.Lock() 1709 defer js.mu.Unlock() 1710 1711 s, cc := js.srv, js.cluster 1712 if cc == nil || cc.meta == nil { 1713 return 1714 } 1715 isLeader := cc.isLeader() 1716 1717 // Now check if we are meta-leader. We will check for any re-assignments. 1718 if !isLeader { 1719 return 1720 } 1721 1722 sir, ok := s.nodeToInfo.Load(peer) 1723 if !ok || sir == nil { 1724 return 1725 } 1726 si := sir.(nodeInfo) 1727 1728 for _, asa := range cc.streams { 1729 for _, sa := range asa { 1730 if sa.missingPeers() { 1731 // Make sure the right cluster etc. 1732 if si.cluster != sa.Client.Cluster { 1733 continue 1734 } 1735 // If we are here we can add in this peer. 1736 csa := sa.copyGroup() 1737 csa.Group.Peers = append(csa.Group.Peers, peer) 1738 // Send our proposal for this csa. Also use same group definition for all the consumers as well. 1739 cc.meta.Propose(encodeAddStreamAssignment(csa)) 1740 for _, ca := range sa.consumers { 1741 // Ephemerals are R=1, so only auto-remap durables, or R>1. 1742 if ca.Config.Durable != _EMPTY_ || len(ca.Group.Peers) > 1 { 1743 cca := ca.copyGroup() 1744 cca.Group.Peers = csa.Group.Peers 1745 cc.meta.Propose(encodeAddConsumerAssignment(cca)) 1746 } 1747 } 1748 } 1749 } 1750 } 1751 } 1752 1753 func (js *jetStream) processRemovePeer(peer string) { 1754 // We may be already disabled. 1755 if js == nil || js.disabled.Load() { 1756 return 1757 } 1758 1759 js.mu.Lock() 1760 s, cc := js.srv, js.cluster 1761 if cc == nil || cc.meta == nil { 1762 js.mu.Unlock() 1763 return 1764 } 1765 isLeader := cc.isLeader() 1766 // All nodes will check if this is them. 1767 isUs := cc.meta.ID() == peer 1768 js.mu.Unlock() 1769 1770 if isUs { 1771 s.Errorf("JetStream being DISABLED, our server was removed from the cluster") 1772 adv := &JSServerRemovedAdvisory{ 1773 TypedEvent: TypedEvent{ 1774 Type: JSServerRemovedAdvisoryType, 1775 ID: nuid.Next(), 1776 Time: time.Now().UTC(), 1777 }, 1778 Server: s.Name(), 1779 ServerID: s.ID(), 1780 Cluster: s.cachedClusterName(), 1781 Domain: s.getOpts().JetStreamDomain, 1782 } 1783 s.publishAdvisory(nil, JSAdvisoryServerRemoved, adv) 1784 1785 go s.DisableJetStream() 1786 } 1787 1788 // Now check if we are meta-leader. We will attempt re-assignment. 1789 if !isLeader { 1790 return 1791 } 1792 1793 js.mu.Lock() 1794 defer js.mu.Unlock() 1795 1796 for _, asa := range cc.streams { 1797 for _, sa := range asa { 1798 if rg := sa.Group; rg.isMember(peer) { 1799 js.removePeerFromStreamLocked(sa, peer) 1800 } 1801 } 1802 } 1803 } 1804 1805 // Assumes all checks have already been done. 1806 func (js *jetStream) removePeerFromStream(sa *streamAssignment, peer string) bool { 1807 js.mu.Lock() 1808 defer js.mu.Unlock() 1809 return js.removePeerFromStreamLocked(sa, peer) 1810 } 1811 1812 // Lock should be held. 1813 func (js *jetStream) removePeerFromStreamLocked(sa *streamAssignment, peer string) bool { 1814 if rg := sa.Group; !rg.isMember(peer) { 1815 return false 1816 } 1817 1818 s, cc, csa := js.srv, js.cluster, sa.copyGroup() 1819 if cc == nil || cc.meta == nil { 1820 return false 1821 } 1822 replaced := cc.remapStreamAssignment(csa, peer) 1823 if !replaced { 1824 s.Warnf("JetStream cluster could not replace peer for stream '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name) 1825 } 1826 1827 // Send our proposal for this csa. Also use same group definition for all the consumers as well. 1828 cc.meta.Propose(encodeAddStreamAssignment(csa)) 1829 rg := csa.Group 1830 for _, ca := range sa.consumers { 1831 // Ephemerals are R=1, so only auto-remap durables, or R>1. 1832 if ca.Config.Durable != _EMPTY_ { 1833 cca := ca.copyGroup() 1834 cca.Group.Peers, cca.Group.Preferred = rg.Peers, _EMPTY_ 1835 cc.meta.Propose(encodeAddConsumerAssignment(cca)) 1836 } else if ca.Group.isMember(peer) { 1837 // These are ephemerals. Check to see if we deleted this peer. 1838 cc.meta.Propose(encodeDeleteConsumerAssignment(ca)) 1839 } 1840 } 1841 return replaced 1842 } 1843 1844 // Check if we have peer related entries. 1845 func (js *jetStream) hasPeerEntries(entries []*Entry) bool { 1846 for _, e := range entries { 1847 if e.Type == EntryRemovePeer || e.Type == EntryAddPeer { 1848 return true 1849 } 1850 } 1851 return false 1852 } 1853 1854 const ksep = ":" 1855 1856 func (sa *streamAssignment) recoveryKey() string { 1857 if sa == nil { 1858 return _EMPTY_ 1859 } 1860 return sa.Client.serviceAccount() + ksep + sa.Config.Name 1861 } 1862 1863 func (ca *consumerAssignment) recoveryKey() string { 1864 if ca == nil { 1865 return _EMPTY_ 1866 } 1867 return ca.Client.serviceAccount() + ksep + ca.Stream + ksep + ca.Name 1868 } 1869 1870 func (js *jetStream) applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bool, bool, bool, error) { 1871 var didSnap, didRemoveStream, didRemoveConsumer bool 1872 isRecovering := js.isMetaRecovering() 1873 1874 for _, e := range entries { 1875 if e.Type == EntrySnapshot { 1876 js.applyMetaSnapshot(e.Data, ru, isRecovering) 1877 didSnap = true 1878 } else if e.Type == EntryRemovePeer { 1879 if !isRecovering { 1880 js.processRemovePeer(string(e.Data)) 1881 } 1882 } else if e.Type == EntryAddPeer { 1883 if !isRecovering { 1884 js.processAddPeer(string(e.Data)) 1885 } 1886 } else { 1887 buf := e.Data 1888 switch entryOp(buf[0]) { 1889 case assignStreamOp: 1890 sa, err := decodeStreamAssignment(buf[1:]) 1891 if err != nil { 1892 js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:]) 1893 return didSnap, didRemoveStream, didRemoveConsumer, err 1894 } 1895 if isRecovering { 1896 js.setStreamAssignmentRecovering(sa) 1897 delete(ru.removeStreams, sa.recoveryKey()) 1898 } 1899 if js.processStreamAssignment(sa) { 1900 didRemoveStream = true 1901 } 1902 case removeStreamOp: 1903 sa, err := decodeStreamAssignment(buf[1:]) 1904 if err != nil { 1905 js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:]) 1906 return didSnap, didRemoveStream, didRemoveConsumer, err 1907 } 1908 if isRecovering { 1909 js.setStreamAssignmentRecovering(sa) 1910 key := sa.recoveryKey() 1911 ru.removeStreams[key] = sa 1912 delete(ru.updateStreams, key) 1913 } else { 1914 js.processStreamRemoval(sa) 1915 didRemoveStream = true 1916 } 1917 case assignConsumerOp: 1918 ca, err := decodeConsumerAssignment(buf[1:]) 1919 if err != nil { 1920 js.srv.Errorf("JetStream cluster failed to decode consumer assignment: %q", buf[1:]) 1921 return didSnap, didRemoveStream, didRemoveConsumer, err 1922 } 1923 if isRecovering { 1924 js.setConsumerAssignmentRecovering(ca) 1925 key := ca.recoveryKey() 1926 delete(ru.removeConsumers, key) 1927 ru.updateConsumers[key] = ca 1928 } else { 1929 js.processConsumerAssignment(ca) 1930 } 1931 case assignCompressedConsumerOp: 1932 ca, err := decodeConsumerAssignmentCompressed(buf[1:]) 1933 if err != nil { 1934 js.srv.Errorf("JetStream cluster failed to decode compressed consumer assignment: %q", buf[1:]) 1935 return didSnap, didRemoveStream, didRemoveConsumer, err 1936 } 1937 if isRecovering { 1938 js.setConsumerAssignmentRecovering(ca) 1939 key := ca.recoveryKey() 1940 delete(ru.removeConsumers, key) 1941 ru.updateConsumers[key] = ca 1942 } else { 1943 js.processConsumerAssignment(ca) 1944 } 1945 case removeConsumerOp: 1946 ca, err := decodeConsumerAssignment(buf[1:]) 1947 if err != nil { 1948 js.srv.Errorf("JetStream cluster failed to decode consumer assignment: %q", buf[1:]) 1949 return didSnap, didRemoveStream, didRemoveConsumer, err 1950 } 1951 if isRecovering { 1952 js.setConsumerAssignmentRecovering(ca) 1953 key := ca.recoveryKey() 1954 ru.removeConsumers[key] = ca 1955 delete(ru.updateConsumers, key) 1956 } else { 1957 js.processConsumerRemoval(ca) 1958 didRemoveConsumer = true 1959 } 1960 case updateStreamOp: 1961 sa, err := decodeStreamAssignment(buf[1:]) 1962 if err != nil { 1963 js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:]) 1964 return didSnap, didRemoveStream, didRemoveConsumer, err 1965 } 1966 if isRecovering { 1967 js.setStreamAssignmentRecovering(sa) 1968 key := sa.recoveryKey() 1969 ru.updateStreams[key] = sa 1970 delete(ru.removeStreams, key) 1971 } else { 1972 js.processUpdateStreamAssignment(sa) 1973 // Since an update can be lowering replica count, we want upper layer to treat 1974 // similar to a removal and snapshot to collapse old entries. 1975 didRemoveStream = true 1976 } 1977 default: 1978 panic(fmt.Sprintf("JetStream Cluster Unknown meta entry op type: %v", entryOp(buf[0]))) 1979 } 1980 } 1981 } 1982 return didSnap, didRemoveStream, didRemoveConsumer, nil 1983 } 1984 1985 func (rg *raftGroup) isMember(id string) bool { 1986 if rg == nil { 1987 return false 1988 } 1989 for _, peer := range rg.Peers { 1990 if peer == id { 1991 return true 1992 } 1993 } 1994 return false 1995 } 1996 1997 func (rg *raftGroup) setPreferred() { 1998 if rg == nil || len(rg.Peers) == 0 { 1999 return 2000 } 2001 if len(rg.Peers) == 1 { 2002 rg.Preferred = rg.Peers[0] 2003 } else { 2004 // For now just randomly select a peer for the preferred. 2005 pi := rand.Int31n(int32(len(rg.Peers))) 2006 rg.Preferred = rg.Peers[pi] 2007 } 2008 } 2009 2010 // createRaftGroup is called to spin up this raft group if needed. 2011 func (js *jetStream) createRaftGroup(accName string, rg *raftGroup, storage StorageType, labels pprofLabels) error { 2012 js.mu.Lock() 2013 s, cc := js.srv, js.cluster 2014 if cc == nil || cc.meta == nil { 2015 js.mu.Unlock() 2016 return NewJSClusterNotActiveError() 2017 } 2018 2019 // If this is a single peer raft group or we are not a member return. 2020 if len(rg.Peers) <= 1 || !rg.isMember(cc.meta.ID()) { 2021 js.mu.Unlock() 2022 // Nothing to do here. 2023 return nil 2024 } 2025 2026 // Check if we already have this assigned. 2027 if node := s.lookupRaftNode(rg.Name); node != nil { 2028 s.Debugf("JetStream cluster already has raft group %q assigned", rg.Name) 2029 rg.node = node 2030 js.mu.Unlock() 2031 return nil 2032 } 2033 2034 s.Debugf("JetStream cluster creating raft group:%+v", rg) 2035 js.mu.Unlock() 2036 2037 sysAcc := s.SystemAccount() 2038 if sysAcc == nil { 2039 s.Debugf("JetStream cluster detected shutdown processing raft group: %+v", rg) 2040 return errors.New("shutting down") 2041 } 2042 2043 // Check here to see if we have a max HA Assets limit set. 2044 if maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets; maxHaAssets > 0 { 2045 if s.numRaftNodes() > maxHaAssets { 2046 s.Warnf("Maximum HA Assets limit reached: %d", maxHaAssets) 2047 // Since the meta leader assigned this, send a statsz update to them to get them up to date. 2048 go s.sendStatszUpdate() 2049 return errors.New("system limit reached") 2050 } 2051 } 2052 2053 storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, rg.Name) 2054 var store StreamStore 2055 if storage == FileStorage { 2056 fs, err := newFileStoreWithCreated( 2057 FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMediumBlockSize, AsyncFlush: false, SyncInterval: 5 * time.Minute, srv: s}, 2058 StreamConfig{Name: rg.Name, Storage: FileStorage, Metadata: labels}, 2059 time.Now().UTC(), 2060 s.jsKeyGen(s.getOpts().JetStreamKey, rg.Name), 2061 s.jsKeyGen(s.getOpts().JetStreamOldKey, rg.Name), 2062 ) 2063 if err != nil { 2064 s.Errorf("Error creating filestore WAL: %v", err) 2065 return err 2066 } 2067 store = fs 2068 } else { 2069 ms, err := newMemStore(&StreamConfig{Name: rg.Name, Storage: MemoryStorage}) 2070 if err != nil { 2071 s.Errorf("Error creating memstore WAL: %v", err) 2072 return err 2073 } 2074 store = ms 2075 } 2076 2077 cfg := &RaftConfig{Name: rg.Name, Store: storeDir, Log: store, Track: true} 2078 2079 if _, err := readPeerState(storeDir); err != nil { 2080 s.bootstrapRaftNode(cfg, rg.Peers, true) 2081 } 2082 2083 n, err := s.startRaftNode(accName, cfg, labels) 2084 if err != nil || n == nil { 2085 s.Debugf("Error creating raft group: %v", err) 2086 return err 2087 } 2088 // Need locking here for the assignment to avoid data-race reports 2089 js.mu.Lock() 2090 rg.node = n 2091 // See if we are preferred and should start campaign immediately. 2092 if n.ID() == rg.Preferred && n.Term() == 0 { 2093 n.Campaign() 2094 } 2095 js.mu.Unlock() 2096 return nil 2097 } 2098 2099 func (mset *stream) raftGroup() *raftGroup { 2100 if mset == nil { 2101 return nil 2102 } 2103 mset.mu.RLock() 2104 defer mset.mu.RUnlock() 2105 if mset.sa == nil { 2106 return nil 2107 } 2108 return mset.sa.Group 2109 } 2110 2111 func (mset *stream) raftNode() RaftNode { 2112 if mset == nil { 2113 return nil 2114 } 2115 mset.mu.RLock() 2116 defer mset.mu.RUnlock() 2117 return mset.node 2118 } 2119 2120 func (mset *stream) removeNode() { 2121 mset.mu.Lock() 2122 defer mset.mu.Unlock() 2123 if n := mset.node; n != nil { 2124 n.Delete() 2125 mset.node = nil 2126 } 2127 } 2128 2129 func (mset *stream) clearRaftNode() { 2130 if mset == nil { 2131 return 2132 } 2133 mset.mu.Lock() 2134 defer mset.mu.Unlock() 2135 mset.node = nil 2136 } 2137 2138 // Helper function to generate peer info. 2139 // lists and sets for old and new. 2140 func genPeerInfo(peers []string, split int) (newPeers, oldPeers []string, newPeerSet, oldPeerSet map[string]bool) { 2141 newPeers = peers[split:] 2142 oldPeers = peers[:split] 2143 newPeerSet = make(map[string]bool, len(newPeers)) 2144 oldPeerSet = make(map[string]bool, len(oldPeers)) 2145 for i, peer := range peers { 2146 if i < split { 2147 oldPeerSet[peer] = true 2148 } else { 2149 newPeerSet[peer] = true 2150 } 2151 } 2152 return 2153 } 2154 2155 // This will wait for a period of time until all consumers are registered and have 2156 // their consumer assignments assigned. 2157 // Should only be called from monitorStream. 2158 func (mset *stream) waitOnConsumerAssignments() { 2159 mset.mu.RLock() 2160 s, js, acc, sa, name := mset.srv, mset.js, mset.acc, mset.sa, mset.cfg.Name 2161 mset.mu.RUnlock() 2162 2163 if s == nil || js == nil || acc == nil || sa == nil { 2164 return 2165 } 2166 2167 js.mu.RLock() 2168 numExpectedConsumers := len(sa.consumers) 2169 js.mu.RUnlock() 2170 2171 // Max to wait. 2172 const maxWaitTime = 10 * time.Second 2173 const sleepTime = 500 * time.Millisecond 2174 2175 // Wait up to 10s 2176 timeout := time.Now().Add(maxWaitTime) 2177 for time.Now().Before(timeout) { 2178 var numReady int 2179 for _, o := range mset.getConsumers() { 2180 // Make sure we are registered with our consumer assignment. 2181 if ca := o.consumerAssignment(); ca != nil { 2182 numReady++ 2183 } else { 2184 break 2185 } 2186 } 2187 // Check if we are good. 2188 if numReady >= numExpectedConsumers { 2189 break 2190 } 2191 2192 s.Debugf("Waiting for consumers for interest based stream '%s > %s'", acc.Name, name) 2193 select { 2194 case <-s.quitCh: 2195 return 2196 case <-mset.monitorQuitC(): 2197 return 2198 case <-time.After(sleepTime): 2199 } 2200 } 2201 2202 if actual := mset.numConsumers(); actual < numExpectedConsumers { 2203 s.Warnf("All consumers not online for '%s > %s': expected %d but only have %d", acc.Name, name, numExpectedConsumers, actual) 2204 } 2205 } 2206 2207 // Monitor our stream node for this stream. 2208 func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnapshot bool) { 2209 s, cc := js.server(), js.cluster 2210 defer s.grWG.Done() 2211 if mset != nil { 2212 defer mset.monitorWg.Done() 2213 } 2214 js.mu.RLock() 2215 n := sa.Group.node 2216 meta := cc.meta 2217 js.mu.RUnlock() 2218 2219 if n == nil || meta == nil { 2220 s.Warnf("No RAFT group for '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name) 2221 return 2222 } 2223 2224 // Make sure only one is running. 2225 if mset != nil { 2226 if mset.checkInMonitor() { 2227 return 2228 } 2229 defer mset.clearMonitorRunning() 2230 } 2231 2232 // Make sure to stop the raft group on exit to prevent accidental memory bloat. 2233 // This should be below the checkInMonitor call though to avoid stopping it out 2234 // from underneath the one that is running since it will be the same raft node. 2235 defer n.Stop() 2236 2237 qch, mqch, lch, aq, uch, ourPeerId := n.QuitC(), mset.monitorQuitC(), n.LeadChangeC(), n.ApplyQ(), mset.updateC(), meta.ID() 2238 2239 s.Debugf("Starting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group()) 2240 defer s.Debugf("Exiting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group()) 2241 2242 // Make sure we do not leave the apply channel to fill up and block the raft layer. 2243 defer func() { 2244 if n.State() == Closed { 2245 return 2246 } 2247 if n.Leader() { 2248 n.StepDown() 2249 } 2250 // Drain the commit queue... 2251 aq.drain() 2252 }() 2253 2254 const ( 2255 compactInterval = 2 * time.Minute 2256 compactSizeMin = 8 * 1024 * 1024 2257 compactNumMin = 65536 2258 minSnapDelta = 10 * time.Second 2259 ) 2260 2261 // Spread these out for large numbers on server restart. 2262 rci := time.Duration(rand.Int63n(int64(time.Minute))) 2263 t := time.NewTicker(compactInterval + rci) 2264 defer t.Stop() 2265 2266 js.mu.RLock() 2267 isLeader := cc.isStreamLeader(sa.Client.serviceAccount(), sa.Config.Name) 2268 isRestore := sa.Restore != nil 2269 js.mu.RUnlock() 2270 2271 acc, err := s.LookupAccount(sa.Client.serviceAccount()) 2272 if err != nil { 2273 s.Warnf("Could not retrieve account for stream '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name) 2274 return 2275 } 2276 accName := acc.GetName() 2277 2278 // Used to represent how we can detect a changed state quickly and without representing 2279 // a complete and detailed state which could be costly in terms of memory, cpu and GC. 2280 // This only entails how many messages, and the first and last sequence of the stream. 2281 // This is all that is needed to detect a change, and we can get this from FilteredState() 2282 // with and empty filter. 2283 var lastState SimpleState 2284 var lastSnapTime time.Time 2285 2286 // Don't allow the upper layer to install snapshots until we have 2287 // fully recovered from disk. 2288 isRecovering := true 2289 2290 // Should only to be called from leader. 2291 doSnapshot := func() { 2292 if mset == nil || isRecovering || isRestore || time.Since(lastSnapTime) < minSnapDelta { 2293 return 2294 } 2295 2296 // Before we actually calculate the detailed state and encode it, let's check the 2297 // simple state to detect any changes. 2298 curState := mset.store.FilteredState(0, _EMPTY_) 2299 2300 // If the state hasn't changed but the log has gone way over 2301 // the compaction size then we will want to compact anyway. 2302 // This shouldn't happen for streams like it can for pull 2303 // consumers on idle streams but better to be safe than sorry! 2304 ne, nb := n.Size() 2305 if curState == lastState && ne < compactNumMin && nb < compactSizeMin { 2306 return 2307 } 2308 2309 if err := n.InstallSnapshot(mset.stateSnapshot()); err == nil { 2310 lastState, lastSnapTime = curState, time.Now() 2311 } else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning { 2312 s.RateLimitWarnf("Failed to install snapshot for '%s > %s' [%s]: %v", mset.acc.Name, mset.name(), n.Group(), err) 2313 } 2314 } 2315 2316 // We will establish a restoreDoneCh no matter what. Will never be triggered unless 2317 // we replace with the restore chan. 2318 restoreDoneCh := make(<-chan error) 2319 2320 // For migration tracking. 2321 var mmt *time.Ticker 2322 var mmtc <-chan time.Time 2323 2324 startMigrationMonitoring := func() { 2325 if mmt == nil { 2326 mmt = time.NewTicker(500 * time.Millisecond) 2327 mmtc = mmt.C 2328 } 2329 } 2330 2331 stopMigrationMonitoring := func() { 2332 if mmt != nil { 2333 mmt.Stop() 2334 mmt, mmtc = nil, nil 2335 } 2336 } 2337 defer stopMigrationMonitoring() 2338 2339 // This is to optionally track when we are ready as a non-leader for direct access participation. 2340 // Either direct or if we are a direct mirror, or both. 2341 var dat *time.Ticker 2342 var datc <-chan time.Time 2343 2344 startDirectAccessMonitoring := func() { 2345 if dat == nil { 2346 dat = time.NewTicker(2 * time.Second) 2347 datc = dat.C 2348 } 2349 } 2350 2351 stopDirectMonitoring := func() { 2352 if dat != nil { 2353 dat.Stop() 2354 dat, datc = nil, nil 2355 } 2356 } 2357 defer stopDirectMonitoring() 2358 2359 if mset != nil && mset.isInterestRetention() { 2360 // Wait on our consumers to be assigned and running before proceeding. 2361 // This can become important when a server has lots of assets 2362 // since we process streams first then consumers as an asset class. 2363 mset.waitOnConsumerAssignments() 2364 } 2365 2366 // This is triggered during a scale up from R1 to clustered mode. We need the new followers to catchup, 2367 // similar to how we trigger the catchup mechanism post a backup/restore. 2368 // We can arrive here NOT being the leader, so we send the snapshot only if we are, and in this case 2369 // reset the notion that we need to send the snapshot. If we are not, then the first time the server 2370 // will switch to leader (in the loop below), we will send the snapshot. 2371 if sendSnapshot && isLeader && mset != nil && n != nil && !isRecovering { 2372 n.SendSnapshot(mset.stateSnapshot()) 2373 sendSnapshot = false 2374 } 2375 2376 for { 2377 select { 2378 case <-s.quitCh: 2379 return 2380 case <-mqch: 2381 return 2382 case <-qch: 2383 return 2384 case <-aq.ch: 2385 var ne, nb uint64 2386 ces := aq.pop() 2387 for _, ce := range ces { 2388 // No special processing needed for when we are caught up on restart. 2389 if ce == nil { 2390 isRecovering = false 2391 // If we are interest based make sure to check consumers if interest retention policy. 2392 // This is to make sure we process any outstanding acks from all consumers. 2393 mset.checkInterestState() 2394 // Make sure we create a new snapshot in case things have changed such that any existing 2395 // snapshot may no longer be valid. 2396 doSnapshot() 2397 // If we became leader during this time and we need to send a snapshot to our 2398 // followers, i.e. as a result of a scale-up from R1, do it now. 2399 if sendSnapshot && isLeader && mset != nil && n != nil { 2400 n.SendSnapshot(mset.stateSnapshot()) 2401 sendSnapshot = false 2402 } 2403 continue 2404 } 2405 // Apply our entries. 2406 if err := js.applyStreamEntries(mset, ce, isRecovering); err == nil { 2407 // Update our applied. 2408 ne, nb = n.Applied(ce.Index) 2409 ce.ReturnToPool() 2410 } else { 2411 // Our stream was closed out from underneath of us, simply return here. 2412 if err == errStreamClosed { 2413 return 2414 } 2415 s.Warnf("Error applying entries to '%s > %s': %v", accName, sa.Config.Name, err) 2416 if isClusterResetErr(err) { 2417 if mset.isMirror() && mset.IsLeader() { 2418 mset.retryMirrorConsumer() 2419 continue 2420 } 2421 // We will attempt to reset our cluster state. 2422 if mset.resetClusteredState(err) { 2423 aq.recycle(&ces) 2424 return 2425 } 2426 } else if isOutOfSpaceErr(err) { 2427 // If applicable this will tear all of this down, but don't assume so and return. 2428 s.handleOutOfSpace(mset) 2429 } 2430 } 2431 } 2432 aq.recycle(&ces) 2433 2434 // Check about snapshotting 2435 // If we have at least min entries to compact, go ahead and try to snapshot/compact. 2436 if ne >= compactNumMin || nb > compactSizeMin { 2437 doSnapshot() 2438 } 2439 2440 case isLeader = <-lch: 2441 if isLeader { 2442 if mset != nil && n != nil && sendSnapshot && !isRecovering { 2443 // If we *are* recovering at the time then this will get done when the apply queue 2444 // handles the nil guard to show the catchup ended. 2445 n.SendSnapshot(mset.stateSnapshot()) 2446 sendSnapshot = false 2447 } 2448 if isRestore { 2449 acc, _ := s.LookupAccount(sa.Client.serviceAccount()) 2450 restoreDoneCh = s.processStreamRestore(sa.Client, acc, sa.Config, _EMPTY_, sa.Reply, _EMPTY_) 2451 continue 2452 } else if n != nil && n.NeedSnapshot() { 2453 doSnapshot() 2454 } 2455 // Always cancel if this was running. 2456 stopDirectMonitoring() 2457 2458 } else if n.GroupLeader() != noLeader { 2459 js.setStreamAssignmentRecovering(sa) 2460 } 2461 2462 // Process our leader change. 2463 js.processStreamLeaderChange(mset, isLeader) 2464 2465 // We may receive a leader change after the stream assignment which would cancel us 2466 // monitoring for this closely. So re-assess our state here as well. 2467 // Or the old leader is no longer part of the set and transferred leadership 2468 // for this leader to resume with removal 2469 migrating := mset.isMigrating() 2470 2471 // Check for migrations here. We set the state on the stream assignment update below. 2472 if isLeader && migrating { 2473 startMigrationMonitoring() 2474 } 2475 2476 // Here we are checking if we are not the leader but we have been asked to allow 2477 // direct access. We now allow non-leaders to participate in the queue group. 2478 if !isLeader && mset != nil { 2479 mset.mu.RLock() 2480 ad, md := mset.cfg.AllowDirect, mset.cfg.MirrorDirect 2481 mset.mu.RUnlock() 2482 if ad || md { 2483 startDirectAccessMonitoring() 2484 } 2485 } 2486 2487 case <-datc: 2488 if mset == nil || isRecovering { 2489 continue 2490 } 2491 // If we are leader we can stop, we know this is setup now. 2492 if isLeader { 2493 stopDirectMonitoring() 2494 continue 2495 } 2496 2497 mset.mu.Lock() 2498 ad, md, current := mset.cfg.AllowDirect, mset.cfg.MirrorDirect, mset.isCurrent() 2499 if !current { 2500 const syncThreshold = 90.0 2501 // We are not current, but current means exactly caught up. Under heavy publish 2502 // loads we may never reach this, so check if we are within 90% caught up. 2503 _, c, a := mset.node.Progress() 2504 if c == 0 { 2505 mset.mu.Unlock() 2506 continue 2507 } 2508 if p := float64(a) / float64(c) * 100.0; p < syncThreshold { 2509 mset.mu.Unlock() 2510 continue 2511 } else { 2512 s.Debugf("Stream '%s > %s' enabling direct gets at %.0f%% synchronized", 2513 sa.Client.serviceAccount(), sa.Config.Name, p) 2514 } 2515 } 2516 // We are current, cancel monitoring and create the direct subs as needed. 2517 if ad { 2518 mset.subscribeToDirect() 2519 } 2520 if md { 2521 mset.subscribeToMirrorDirect() 2522 } 2523 mset.mu.Unlock() 2524 // Stop direct monitoring. 2525 stopDirectMonitoring() 2526 2527 case <-t.C: 2528 doSnapshot() 2529 2530 case <-uch: 2531 // keep stream assignment current 2532 sa = mset.streamAssignment() 2533 2534 // keep peer list up to date with config 2535 js.checkPeers(mset.raftGroup()) 2536 // We get this when we have a new stream assignment caused by an update. 2537 // We want to know if we are migrating. 2538 if migrating := mset.isMigrating(); migrating { 2539 if isLeader && mmtc == nil { 2540 startMigrationMonitoring() 2541 } 2542 } else { 2543 stopMigrationMonitoring() 2544 } 2545 case <-mmtc: 2546 if !isLeader { 2547 // We are no longer leader, so not our job. 2548 stopMigrationMonitoring() 2549 continue 2550 } 2551 2552 // Check to see where we are.. 2553 rg := mset.raftGroup() 2554 2555 // Track the new peers and check the ones that are current. 2556 mset.mu.RLock() 2557 replicas := mset.cfg.Replicas 2558 mset.mu.RUnlock() 2559 if len(rg.Peers) <= replicas { 2560 // Migration no longer happening, so not our job anymore 2561 stopMigrationMonitoring() 2562 continue 2563 } 2564 2565 // Make sure we have correct cluster information on the other peers. 2566 ci := js.clusterInfo(rg) 2567 mset.checkClusterInfo(ci) 2568 2569 newPeers, oldPeers, newPeerSet, oldPeerSet := genPeerInfo(rg.Peers, len(rg.Peers)-replicas) 2570 2571 // If we are part of the new peerset and we have been passed the baton. 2572 // We will handle scale down. 2573 if newPeerSet[ourPeerId] { 2574 // First need to check on any consumers and make sure they have moved properly before scaling down ourselves. 2575 js.mu.RLock() 2576 var needToWait bool 2577 for name, c := range sa.consumers { 2578 for _, peer := range c.Group.Peers { 2579 // If we have peers still in the old set block. 2580 if oldPeerSet[peer] { 2581 s.Debugf("Scale down of '%s > %s' blocked by consumer '%s'", accName, sa.Config.Name, name) 2582 needToWait = true 2583 break 2584 } 2585 } 2586 if needToWait { 2587 break 2588 } 2589 } 2590 js.mu.RUnlock() 2591 if needToWait { 2592 continue 2593 } 2594 2595 // We are good to go, can scale down here. 2596 for _, p := range oldPeers { 2597 n.ProposeRemovePeer(p) 2598 } 2599 2600 csa := sa.copyGroup() 2601 csa.Group.Peers = newPeers 2602 csa.Group.Preferred = ourPeerId 2603 csa.Group.Cluster = s.cachedClusterName() 2604 cc.meta.ForwardProposal(encodeUpdateStreamAssignment(csa)) 2605 s.Noticef("Scaling down '%s > %s' to %+v", accName, sa.Config.Name, s.peerSetToNames(newPeers)) 2606 } else { 2607 // We are the old leader here, from the original peer set. 2608 // We are simply waiting on the new peerset to be caught up so we can transfer leadership. 2609 var newLeaderPeer, newLeader string 2610 neededCurrent, current := replicas/2+1, 0 2611 2612 for _, r := range ci.Replicas { 2613 if r.Current && newPeerSet[r.Peer] { 2614 current++ 2615 if newLeader == _EMPTY_ { 2616 newLeaderPeer, newLeader = r.Peer, r.Name 2617 } 2618 } 2619 } 2620 // Check if we have a quorom. 2621 if current >= neededCurrent { 2622 s.Noticef("Transfer of stream leader for '%s > %s' to '%s'", accName, sa.Config.Name, newLeader) 2623 n.UpdateKnownPeers(newPeers) 2624 n.StepDown(newLeaderPeer) 2625 } 2626 } 2627 2628 case err := <-restoreDoneCh: 2629 // We have completed a restore from snapshot on this server. The stream assignment has 2630 // already been assigned but the replicas will need to catch up out of band. Consumers 2631 // will need to be assigned by forwarding the proposal and stamping the initial state. 2632 s.Debugf("Stream restore for '%s > %s' completed", sa.Client.serviceAccount(), sa.Config.Name) 2633 if err != nil { 2634 s.Debugf("Stream restore failed: %v", err) 2635 } 2636 isRestore = false 2637 sa.Restore = nil 2638 // If we were successful lookup up our stream now. 2639 if err == nil { 2640 if mset, err = acc.lookupStream(sa.Config.Name); mset != nil { 2641 mset.monitorWg.Add(1) 2642 defer mset.monitorWg.Done() 2643 mset.setStreamAssignment(sa) 2644 // Make sure to update our updateC which would have been nil. 2645 uch = mset.updateC() 2646 // Also update our mqch 2647 mqch = mset.monitorQuitC() 2648 } 2649 } 2650 if err != nil { 2651 if mset != nil { 2652 mset.delete() 2653 } 2654 js.mu.Lock() 2655 sa.err = err 2656 if n != nil { 2657 n.Delete() 2658 } 2659 result := &streamAssignmentResult{ 2660 Account: sa.Client.serviceAccount(), 2661 Stream: sa.Config.Name, 2662 Restore: &JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}}, 2663 } 2664 result.Restore.Error = NewJSStreamAssignmentError(err, Unless(err)) 2665 js.mu.Unlock() 2666 // Send response to the metadata leader. They will forward to the user as needed. 2667 s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result) 2668 return 2669 } 2670 2671 if !isLeader { 2672 panic("Finished restore but not leader") 2673 } 2674 // Trigger the stream followers to catchup. 2675 if n = mset.raftNode(); n != nil { 2676 n.SendSnapshot(mset.stateSnapshot()) 2677 } 2678 js.processStreamLeaderChange(mset, isLeader) 2679 2680 // Check to see if we have restored consumers here. 2681 // These are not currently assigned so we will need to do so here. 2682 if consumers := mset.getPublicConsumers(); len(consumers) > 0 { 2683 for _, o := range consumers { 2684 name, cfg := o.String(), o.config() 2685 rg := cc.createGroupForConsumer(&cfg, sa) 2686 // Pick a preferred leader. 2687 rg.setPreferred() 2688 2689 // Place our initial state here as well for assignment distribution. 2690 state, _ := o.store.State() 2691 ca := &consumerAssignment{ 2692 Group: rg, 2693 Stream: sa.Config.Name, 2694 Name: name, 2695 Config: &cfg, 2696 Client: sa.Client, 2697 Created: o.createdTime(), 2698 State: state, 2699 } 2700 2701 // We make these compressed in case state is complex. 2702 addEntry := encodeAddConsumerAssignmentCompressed(ca) 2703 cc.meta.ForwardProposal(addEntry) 2704 2705 // Check to make sure we see the assignment. 2706 go func() { 2707 ticker := time.NewTicker(time.Second) 2708 defer ticker.Stop() 2709 for range ticker.C { 2710 js.mu.RLock() 2711 ca, meta := js.consumerAssignment(ca.Client.serviceAccount(), sa.Config.Name, name), cc.meta 2712 js.mu.RUnlock() 2713 if ca == nil { 2714 s.Warnf("Consumer assignment has not been assigned, retrying") 2715 if meta != nil { 2716 meta.ForwardProposal(addEntry) 2717 } else { 2718 return 2719 } 2720 } else { 2721 return 2722 } 2723 } 2724 }() 2725 } 2726 } 2727 } 2728 } 2729 } 2730 2731 // Determine if we are migrating 2732 func (mset *stream) isMigrating() bool { 2733 if mset == nil { 2734 return false 2735 } 2736 2737 mset.mu.RLock() 2738 js, sa := mset.js, mset.sa 2739 mset.mu.RUnlock() 2740 2741 js.mu.RLock() 2742 defer js.mu.RUnlock() 2743 2744 // During migration we will always be R>1, even when we start R1. 2745 // So if we do not have a group or node we no we are not migrating. 2746 if sa == nil || sa.Group == nil || sa.Group.node == nil { 2747 return false 2748 } 2749 // The sign of migration is if our group peer count != configured replica count. 2750 if sa.Config.Replicas == len(sa.Group.Peers) { 2751 return false 2752 } 2753 return true 2754 } 2755 2756 // resetClusteredState is called when a clustered stream had an error (e.g sequence mismatch, bad snapshot) and needs to be reset. 2757 func (mset *stream) resetClusteredState(err error) bool { 2758 mset.mu.RLock() 2759 s, js, jsa, sa, acc, node := mset.srv, mset.js, mset.jsa, mset.sa, mset.acc, mset.node 2760 stype, isLeader, tierName, replicas := mset.cfg.Storage, mset.isLeader(), mset.tier, mset.cfg.Replicas 2761 mset.mu.RUnlock() 2762 2763 // Stepdown regardless if we are the leader here. 2764 if isLeader && node != nil { 2765 node.StepDown() 2766 } 2767 2768 // If we detect we are shutting down just return. 2769 if js != nil && js.isShuttingDown() { 2770 s.Debugf("Will not reset stream, jetstream shutting down") 2771 return false 2772 } 2773 2774 // Server 2775 if js.limitsExceeded(stype) { 2776 s.Warnf("Will not reset stream, server resources exceeded") 2777 return false 2778 } 2779 2780 // Account 2781 if exceeded, _ := jsa.limitsExceeded(stype, tierName, replicas); exceeded { 2782 s.Warnf("stream '%s > %s' errored, account resources exceeded", acc, mset.name()) 2783 return false 2784 } 2785 2786 // We delete our raft state. Will recreate. 2787 if node != nil { 2788 node.Delete() 2789 } 2790 2791 // Preserve our current state and messages unless we have a first sequence mismatch. 2792 shouldDelete := err == errFirstSequenceMismatch 2793 2794 // Need to do the rest in a separate Go routine. 2795 go func() { 2796 mset.monitorWg.Wait() 2797 mset.resetAndWaitOnConsumers() 2798 // Stop our stream. 2799 mset.stop(shouldDelete, false) 2800 2801 if sa != nil { 2802 js.mu.Lock() 2803 if js.shuttingDown { 2804 js.mu.Unlock() 2805 return 2806 } 2807 2808 s.Warnf("Resetting stream cluster state for '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name) 2809 // Now wipe groups from assignments. 2810 sa.Group.node = nil 2811 var consumers []*consumerAssignment 2812 if cc := js.cluster; cc != nil && cc.meta != nil { 2813 ourID := cc.meta.ID() 2814 for _, ca := range sa.consumers { 2815 if rg := ca.Group; rg != nil && rg.isMember(ourID) { 2816 rg.node = nil // Erase group raft/node state. 2817 consumers = append(consumers, ca) 2818 } 2819 } 2820 } 2821 js.mu.Unlock() 2822 2823 // This will reset the stream and consumers. 2824 // Reset stream. 2825 js.processClusterCreateStream(acc, sa) 2826 // Reset consumers. 2827 for _, ca := range consumers { 2828 js.processClusterCreateConsumer(ca, nil, false) 2829 } 2830 } 2831 }() 2832 2833 return true 2834 } 2835 2836 func isControlHdr(hdr []byte) bool { 2837 return bytes.HasPrefix(hdr, []byte("NATS/1.0 100 ")) 2838 } 2839 2840 // Apply our stream entries. 2841 func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isRecovering bool) error { 2842 for _, e := range ce.Entries { 2843 if e.Type == EntryNormal { 2844 buf, op := e.Data, entryOp(e.Data[0]) 2845 switch op { 2846 case streamMsgOp, compressedStreamMsgOp: 2847 if mset == nil { 2848 continue 2849 } 2850 s := js.srv 2851 2852 mbuf := buf[1:] 2853 if op == compressedStreamMsgOp { 2854 var err error 2855 mbuf, err = s2.Decode(nil, mbuf) 2856 if err != nil { 2857 panic(err.Error()) 2858 } 2859 } 2860 2861 subject, reply, hdr, msg, lseq, ts, err := decodeStreamMsg(mbuf) 2862 if err != nil { 2863 if node := mset.raftNode(); node != nil { 2864 s.Errorf("JetStream cluster could not decode stream msg for '%s > %s' [%s]", 2865 mset.account(), mset.name(), node.Group()) 2866 } 2867 panic(err.Error()) 2868 } 2869 2870 // Check for flowcontrol here. 2871 if len(msg) == 0 && len(hdr) > 0 && reply != _EMPTY_ && isControlHdr(hdr) { 2872 if !isRecovering { 2873 mset.sendFlowControlReply(reply) 2874 } 2875 continue 2876 } 2877 2878 // Grab last sequence and CLFS. 2879 last, clfs := mset.lastSeqAndCLFS() 2880 2881 // We can skip if we know this is less than what we already have. 2882 if lseq-clfs < last { 2883 s.Debugf("Apply stream entries for '%s > %s' skipping message with sequence %d with last of %d", 2884 mset.account(), mset.name(), lseq+1-clfs, last) 2885 mset.mu.Lock() 2886 // Check for any preAcks in case we are interest based. 2887 mset.clearAllPreAcks(lseq + 1 - clfs) 2888 mset.mu.Unlock() 2889 continue 2890 } 2891 2892 // Skip by hand here since first msg special case. 2893 // Reason is sequence is unsigned and for lseq being 0 2894 // the lseq under stream would have to be -1. 2895 if lseq == 0 && last != 0 { 2896 continue 2897 } 2898 2899 // Messages to be skipped have no subject or timestamp or msg or hdr. 2900 if subject == _EMPTY_ && ts == 0 && len(msg) == 0 && len(hdr) == 0 { 2901 // Skip and update our lseq. 2902 last := mset.store.SkipMsg() 2903 mset.setLastSeq(last) 2904 mset.clearAllPreAcks(last) 2905 continue 2906 } 2907 2908 var mt *msgTrace 2909 // If not recovering, see if we find a message trace object for this 2910 // sequence. Only the leader that has proposed this entry will have 2911 // stored the trace info. 2912 if !isRecovering { 2913 mt = mset.getAndDeleteMsgTrace(lseq) 2914 } 2915 // Process the actual message here. 2916 err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt) 2917 2918 // If we have inflight make sure to clear after processing. 2919 // TODO(dlc) - technically check on inflight != nil could cause datarace. 2920 // But do not want to acquire lock since tracking this will be rare. 2921 if mset.inflight != nil { 2922 mset.clMu.Lock() 2923 delete(mset.inflight, lseq) 2924 mset.clMu.Unlock() 2925 } 2926 2927 if err != nil { 2928 if err == errLastSeqMismatch { 2929 var state StreamState 2930 mset.store.FastState(&state) 2931 2932 // If we have no msgs and the other side is delivering us a sequence past where we 2933 // should be reset. This is possible if the other side has a stale snapshot and no longer 2934 // has those messages. So compact and retry to reset. 2935 if state.Msgs == 0 { 2936 mset.store.Compact(lseq + 1) 2937 // Retry 2938 err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt) 2939 } 2940 } 2941 2942 // Only return in place if we are going to reset our stream or we are out of space, or we are closed. 2943 if isClusterResetErr(err) || isOutOfSpaceErr(err) || err == errStreamClosed { 2944 return err 2945 } 2946 s.Debugf("Apply stream entries for '%s > %s' got error processing message: %v", 2947 mset.account(), mset.name(), err) 2948 } 2949 2950 case deleteMsgOp: 2951 md, err := decodeMsgDelete(buf[1:]) 2952 if err != nil { 2953 if node := mset.raftNode(); node != nil { 2954 s := js.srv 2955 s.Errorf("JetStream cluster could not decode delete msg for '%s > %s' [%s]", 2956 mset.account(), mset.name(), node.Group()) 2957 } 2958 panic(err.Error()) 2959 } 2960 s, cc := js.server(), js.cluster 2961 2962 var removed bool 2963 if md.NoErase { 2964 removed, err = mset.removeMsg(md.Seq) 2965 } else { 2966 removed, err = mset.eraseMsg(md.Seq) 2967 } 2968 2969 // Cluster reset error. 2970 if err == ErrStoreEOF { 2971 return err 2972 } 2973 2974 if err != nil && !isRecovering { 2975 s.Debugf("JetStream cluster failed to delete stream msg %d from '%s > %s': %v", 2976 md.Seq, md.Client.serviceAccount(), md.Stream, err) 2977 } 2978 2979 js.mu.RLock() 2980 isLeader := cc.isStreamLeader(md.Client.serviceAccount(), md.Stream) 2981 js.mu.RUnlock() 2982 2983 if isLeader && !isRecovering { 2984 var resp = JSApiMsgDeleteResponse{ApiResponse: ApiResponse{Type: JSApiMsgDeleteResponseType}} 2985 if err != nil { 2986 resp.Error = NewJSStreamMsgDeleteFailedError(err, Unless(err)) 2987 s.sendAPIErrResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp)) 2988 } else if !removed { 2989 resp.Error = NewJSSequenceNotFoundError(md.Seq) 2990 s.sendAPIErrResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp)) 2991 } else { 2992 resp.Success = true 2993 s.sendAPIResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp)) 2994 } 2995 } 2996 case purgeStreamOp: 2997 sp, err := decodeStreamPurge(buf[1:]) 2998 if err != nil { 2999 if node := mset.raftNode(); node != nil { 3000 s := js.srv 3001 s.Errorf("JetStream cluster could not decode purge msg for '%s > %s' [%s]", 3002 mset.account(), mset.name(), node.Group()) 3003 } 3004 panic(err.Error()) 3005 } 3006 // If no explicit request, fill in with leader stamped last sequence to protect ourselves on replay during server start. 3007 if sp.Request == nil || sp.Request.Sequence == 0 { 3008 purgeSeq := sp.LastSeq + 1 3009 if sp.Request == nil { 3010 sp.Request = &JSApiStreamPurgeRequest{Sequence: purgeSeq} 3011 } else if sp.Request.Keep == 0 { 3012 sp.Request.Sequence = purgeSeq 3013 } else if isRecovering { 3014 continue 3015 } 3016 } 3017 3018 s := js.server() 3019 purged, err := mset.purge(sp.Request) 3020 if err != nil { 3021 s.Warnf("JetStream cluster failed to purge stream %q for account %q: %v", sp.Stream, sp.Client.serviceAccount(), err) 3022 } 3023 3024 js.mu.RLock() 3025 isLeader := js.cluster.isStreamLeader(sp.Client.serviceAccount(), sp.Stream) 3026 js.mu.RUnlock() 3027 3028 if isLeader && !isRecovering { 3029 var resp = JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}} 3030 if err != nil { 3031 resp.Error = NewJSStreamGeneralError(err, Unless(err)) 3032 s.sendAPIErrResponse(sp.Client, mset.account(), sp.Subject, sp.Reply, _EMPTY_, s.jsonResponse(resp)) 3033 } else { 3034 resp.Purged = purged 3035 resp.Success = true 3036 s.sendAPIResponse(sp.Client, mset.account(), sp.Subject, sp.Reply, _EMPTY_, s.jsonResponse(resp)) 3037 } 3038 } 3039 default: 3040 panic(fmt.Sprintf("JetStream Cluster Unknown group entry op type: %v", op)) 3041 } 3042 } else if e.Type == EntrySnapshot { 3043 if mset == nil { 3044 return nil 3045 } 3046 3047 // Everything operates on new replicated state. Will convert legacy snapshots to this for processing. 3048 var ss *StreamReplicatedState 3049 3050 onBadState := func(err error) { 3051 // If we are the leader or recovering, meaning we own the snapshot, 3052 // we should stepdown and clear our raft state since our snapshot is bad. 3053 if isRecovering || mset.IsLeader() { 3054 mset.mu.RLock() 3055 s, accName, streamName := mset.srv, mset.acc.GetName(), mset.cfg.Name 3056 mset.mu.RUnlock() 3057 s.Warnf("Detected bad stream state, resetting '%s > %s'", accName, streamName) 3058 mset.resetClusteredState(err) 3059 } 3060 } 3061 3062 // Check if we are the new binary encoding. 3063 if IsEncodedStreamState(e.Data) { 3064 var err error 3065 ss, err = DecodeStreamState(e.Data) 3066 if err != nil { 3067 onBadState(err) 3068 return err 3069 } 3070 } else { 3071 var snap streamSnapshot 3072 if err := json.Unmarshal(e.Data, &snap); err != nil { 3073 onBadState(err) 3074 return err 3075 } 3076 // Convert over to StreamReplicatedState 3077 ss = &StreamReplicatedState{ 3078 Msgs: snap.Msgs, 3079 Bytes: snap.Bytes, 3080 FirstSeq: snap.FirstSeq, 3081 LastSeq: snap.LastSeq, 3082 Failed: snap.Failed, 3083 } 3084 if len(snap.Deleted) > 0 { 3085 ss.Deleted = append(ss.Deleted, DeleteSlice(snap.Deleted)) 3086 } 3087 } 3088 3089 if !isRecovering && !mset.IsLeader() { 3090 if err := mset.processSnapshot(ss); err != nil { 3091 return err 3092 } 3093 } else if isRecovering { 3094 // On recovery, reset CLFS/FAILED. 3095 mset.setCLFS(ss.Failed) 3096 } 3097 } else if e.Type == EntryRemovePeer { 3098 js.mu.RLock() 3099 var ourID string 3100 if js.cluster != nil && js.cluster.meta != nil { 3101 ourID = js.cluster.meta.ID() 3102 } 3103 js.mu.RUnlock() 3104 // We only need to do processing if this is us. 3105 if peer := string(e.Data); peer == ourID && mset != nil { 3106 // Double check here with the registered stream assignment. 3107 shouldRemove := true 3108 if sa := mset.streamAssignment(); sa != nil && sa.Group != nil { 3109 js.mu.RLock() 3110 shouldRemove = !sa.Group.isMember(ourID) 3111 js.mu.RUnlock() 3112 } 3113 if shouldRemove { 3114 mset.stop(true, false) 3115 } 3116 } 3117 return nil 3118 } 3119 } 3120 return nil 3121 } 3122 3123 // Returns the PeerInfo for all replicas of a raft node. This is different than node.Peers() 3124 // and is used for external facing advisories. 3125 func (s *Server) replicas(node RaftNode) []*PeerInfo { 3126 now := time.Now() 3127 var replicas []*PeerInfo 3128 for _, rp := range node.Peers() { 3129 if sir, ok := s.nodeToInfo.Load(rp.ID); ok && sir != nil { 3130 si := sir.(nodeInfo) 3131 pi := &PeerInfo{Peer: rp.ID, Name: si.name, Current: rp.Current, Active: now.Sub(rp.Last), Offline: si.offline, Lag: rp.Lag} 3132 replicas = append(replicas, pi) 3133 } 3134 } 3135 return replicas 3136 } 3137 3138 // Will check our node peers and see if we should remove a peer. 3139 func (js *jetStream) checkPeers(rg *raftGroup) { 3140 js.mu.Lock() 3141 defer js.mu.Unlock() 3142 3143 // FIXME(dlc) - Single replicas? 3144 if rg == nil || rg.node == nil { 3145 return 3146 } 3147 for _, peer := range rg.node.Peers() { 3148 if !rg.isMember(peer.ID) { 3149 rg.node.ProposeRemovePeer(peer.ID) 3150 } 3151 } 3152 } 3153 3154 // Process a leader change for the clustered stream. 3155 func (js *jetStream) processStreamLeaderChange(mset *stream, isLeader bool) { 3156 if mset == nil { 3157 return 3158 } 3159 sa := mset.streamAssignment() 3160 if sa == nil { 3161 return 3162 } 3163 3164 // Clear inflight if we have it. 3165 mset.clMu.Lock() 3166 mset.inflight = nil 3167 mset.clMu.Unlock() 3168 3169 js.mu.Lock() 3170 s, account, err := js.srv, sa.Client.serviceAccount(), sa.err 3171 client, subject, reply := sa.Client, sa.Subject, sa.Reply 3172 hasResponded := sa.responded 3173 sa.responded = true 3174 peers := copyStrings(sa.Group.Peers) 3175 js.mu.Unlock() 3176 3177 streamName := mset.name() 3178 3179 if isLeader { 3180 s.Noticef("JetStream cluster new stream leader for '%s > %s'", account, streamName) 3181 s.sendStreamLeaderElectAdvisory(mset) 3182 // Check for peer removal and process here if needed. 3183 js.checkPeers(sa.Group) 3184 mset.checkAllowMsgCompress(peers) 3185 } else { 3186 // We are stepping down. 3187 // Make sure if we are doing so because we have lost quorum that we send the appropriate advisories. 3188 if node := mset.raftNode(); node != nil && !node.Quorum() && time.Since(node.Created()) > 5*time.Second { 3189 s.sendStreamLostQuorumAdvisory(mset) 3190 } 3191 3192 // Clear clseq. If we become leader again, it will be fixed up 3193 // automatically on the next processClusteredInboundMsg call. 3194 mset.clMu.Lock() 3195 if mset.clseq > 0 { 3196 mset.clseq = 0 3197 } 3198 mset.clMu.Unlock() 3199 } 3200 3201 // Tell stream to switch leader status. 3202 mset.setLeader(isLeader) 3203 3204 if !isLeader || hasResponded { 3205 return 3206 } 3207 3208 acc, _ := s.LookupAccount(account) 3209 if acc == nil { 3210 return 3211 } 3212 3213 // Send our response. 3214 var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}} 3215 if err != nil { 3216 resp.Error = NewJSStreamCreateError(err, Unless(err)) 3217 s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 3218 } else { 3219 resp.StreamInfo = &StreamInfo{ 3220 Created: mset.createdTime(), 3221 State: mset.state(), 3222 Config: mset.config(), 3223 Cluster: js.clusterInfo(mset.raftGroup()), 3224 Sources: mset.sourcesInfo(), 3225 Mirror: mset.mirrorInfo(), 3226 TimeStamp: time.Now().UTC(), 3227 } 3228 resp.DidCreate = true 3229 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 3230 if node := mset.raftNode(); node != nil { 3231 mset.sendCreateAdvisory() 3232 } 3233 } 3234 } 3235 3236 // Fixed value ok for now. 3237 const lostQuorumAdvInterval = 10 * time.Second 3238 3239 // Determines if we should send lost quorum advisory. We throttle these after first one. 3240 func (mset *stream) shouldSendLostQuorum() bool { 3241 mset.mu.Lock() 3242 defer mset.mu.Unlock() 3243 if time.Since(mset.lqsent) >= lostQuorumAdvInterval { 3244 mset.lqsent = time.Now() 3245 return true 3246 } 3247 return false 3248 } 3249 3250 func (s *Server) sendStreamLostQuorumAdvisory(mset *stream) { 3251 if mset == nil { 3252 return 3253 } 3254 node, stream, acc := mset.raftNode(), mset.name(), mset.account() 3255 if node == nil { 3256 return 3257 } 3258 if !mset.shouldSendLostQuorum() { 3259 return 3260 } 3261 3262 s.Warnf("JetStream cluster stream '%s > %s' has NO quorum, stalled", acc.GetName(), stream) 3263 3264 subj := JSAdvisoryStreamQuorumLostPre + "." + stream 3265 adv := &JSStreamQuorumLostAdvisory{ 3266 TypedEvent: TypedEvent{ 3267 Type: JSStreamQuorumLostAdvisoryType, 3268 ID: nuid.Next(), 3269 Time: time.Now().UTC(), 3270 }, 3271 Stream: stream, 3272 Replicas: s.replicas(node), 3273 Domain: s.getOpts().JetStreamDomain, 3274 } 3275 3276 // Send to the user's account if not the system account. 3277 if acc != s.SystemAccount() { 3278 s.publishAdvisory(acc, subj, adv) 3279 } 3280 // Now do system level one. Place account info in adv, and nil account means system. 3281 adv.Account = acc.GetName() 3282 s.publishAdvisory(nil, subj, adv) 3283 } 3284 3285 func (s *Server) sendStreamLeaderElectAdvisory(mset *stream) { 3286 if mset == nil { 3287 return 3288 } 3289 node, stream, acc := mset.raftNode(), mset.name(), mset.account() 3290 if node == nil { 3291 return 3292 } 3293 subj := JSAdvisoryStreamLeaderElectedPre + "." + stream 3294 adv := &JSStreamLeaderElectedAdvisory{ 3295 TypedEvent: TypedEvent{ 3296 Type: JSStreamLeaderElectedAdvisoryType, 3297 ID: nuid.Next(), 3298 Time: time.Now().UTC(), 3299 }, 3300 Stream: stream, 3301 Leader: s.serverNameForNode(node.GroupLeader()), 3302 Replicas: s.replicas(node), 3303 Domain: s.getOpts().JetStreamDomain, 3304 } 3305 3306 // Send to the user's account if not the system account. 3307 if acc != s.SystemAccount() { 3308 s.publishAdvisory(acc, subj, adv) 3309 } 3310 // Now do system level one. Place account info in adv, and nil account means system. 3311 adv.Account = acc.GetName() 3312 s.publishAdvisory(nil, subj, adv) 3313 } 3314 3315 // Will lookup a stream assignment. 3316 // Lock should be held. 3317 func (js *jetStream) streamAssignment(account, stream string) (sa *streamAssignment) { 3318 cc := js.cluster 3319 if cc == nil { 3320 return nil 3321 } 3322 3323 if as := cc.streams[account]; as != nil { 3324 sa = as[stream] 3325 } 3326 return sa 3327 } 3328 3329 // processStreamAssignment is called when followers have replicated an assignment. 3330 func (js *jetStream) processStreamAssignment(sa *streamAssignment) bool { 3331 js.mu.Lock() 3332 s, cc := js.srv, js.cluster 3333 accName, stream := sa.Client.serviceAccount(), sa.Config.Name 3334 noMeta := cc == nil || cc.meta == nil 3335 var ourID string 3336 if !noMeta { 3337 ourID = cc.meta.ID() 3338 } 3339 var isMember bool 3340 if sa.Group != nil && ourID != _EMPTY_ { 3341 isMember = sa.Group.isMember(ourID) 3342 } 3343 3344 // Remove this stream from the inflight proposals 3345 cc.removeInflightProposal(accName, sa.Config.Name) 3346 3347 if s == nil || noMeta { 3348 js.mu.Unlock() 3349 return false 3350 } 3351 3352 accStreams := cc.streams[accName] 3353 if accStreams == nil { 3354 accStreams = make(map[string]*streamAssignment) 3355 } else if osa := accStreams[stream]; osa != nil && osa != sa { 3356 // Copy over private existing state from former SA. 3357 if sa.Group != nil { 3358 sa.Group.node = osa.Group.node 3359 } 3360 sa.consumers = osa.consumers 3361 sa.responded = osa.responded 3362 sa.err = osa.err 3363 } 3364 3365 // Update our state. 3366 accStreams[stream] = sa 3367 cc.streams[accName] = accStreams 3368 hasResponded := sa.responded 3369 js.mu.Unlock() 3370 3371 acc, err := s.LookupAccount(accName) 3372 if err != nil { 3373 ll := fmt.Sprintf("Account [%s] lookup for stream create failed: %v", accName, err) 3374 if isMember { 3375 if !hasResponded { 3376 // If we can not lookup the account and we are a member, send this result back to the metacontroller leader. 3377 result := &streamAssignmentResult{ 3378 Account: accName, 3379 Stream: stream, 3380 Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}, 3381 } 3382 result.Response.Error = NewJSNoAccountError() 3383 s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result) 3384 } 3385 s.Warnf(ll) 3386 } else { 3387 s.Debugf(ll) 3388 } 3389 return false 3390 } 3391 3392 var didRemove bool 3393 3394 // Check if this is for us.. 3395 if isMember { 3396 js.processClusterCreateStream(acc, sa) 3397 } else if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { 3398 // We have one here even though we are not a member. This can happen on re-assignment. 3399 s.removeStream(ourID, mset, sa) 3400 } 3401 3402 // If this stream assignment does not have a sync subject (bug) set that the meta-leader should check when elected. 3403 if sa.Sync == _EMPTY_ { 3404 js.mu.Lock() 3405 cc.streamsCheck = true 3406 js.mu.Unlock() 3407 return false 3408 } 3409 3410 return didRemove 3411 } 3412 3413 // processUpdateStreamAssignment is called when followers have replicated an updated assignment. 3414 func (js *jetStream) processUpdateStreamAssignment(sa *streamAssignment) { 3415 js.mu.RLock() 3416 s, cc := js.srv, js.cluster 3417 js.mu.RUnlock() 3418 if s == nil || cc == nil { 3419 // TODO(dlc) - debug at least 3420 return 3421 } 3422 3423 accName := sa.Client.serviceAccount() 3424 stream := sa.Config.Name 3425 3426 js.mu.Lock() 3427 if cc.meta == nil { 3428 js.mu.Unlock() 3429 return 3430 } 3431 ourID := cc.meta.ID() 3432 3433 var isMember bool 3434 if sa.Group != nil { 3435 isMember = sa.Group.isMember(ourID) 3436 } 3437 3438 accStreams := cc.streams[accName] 3439 if accStreams == nil { 3440 js.mu.Unlock() 3441 return 3442 } 3443 osa := accStreams[stream] 3444 if osa == nil { 3445 js.mu.Unlock() 3446 return 3447 } 3448 3449 // Copy over private existing state from former SA. 3450 if sa.Group != nil { 3451 sa.Group.node = osa.Group.node 3452 } 3453 sa.consumers = osa.consumers 3454 sa.err = osa.err 3455 3456 // If we detect we are scaling down to 1, non-clustered, and we had a previous node, clear it here. 3457 if sa.Config.Replicas == 1 && sa.Group.node != nil { 3458 sa.Group.node = nil 3459 } 3460 3461 // Update our state. 3462 accStreams[stream] = sa 3463 cc.streams[accName] = accStreams 3464 3465 // Make sure we respond if we are a member. 3466 if isMember { 3467 sa.responded = false 3468 } else { 3469 // Make sure to clean up any old node in case this stream moves back here. 3470 if sa.Group != nil { 3471 sa.Group.node = nil 3472 } 3473 } 3474 js.mu.Unlock() 3475 3476 acc, err := s.LookupAccount(accName) 3477 if err != nil { 3478 s.Warnf("Update Stream Account %s, error on lookup: %v", accName, err) 3479 return 3480 } 3481 3482 // Check if this is for us.. 3483 if isMember { 3484 js.processClusterUpdateStream(acc, osa, sa) 3485 } else if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { 3486 // We have one here even though we are not a member. This can happen on re-assignment. 3487 s.removeStream(ourID, mset, sa) 3488 } 3489 } 3490 3491 // Common function to remove ourself from this server. 3492 // This can happen on re-assignment, move, etc 3493 func (s *Server) removeStream(ourID string, mset *stream, nsa *streamAssignment) { 3494 if mset == nil { 3495 return 3496 } 3497 // Make sure to use the new stream assignment, not our own. 3498 s.Debugf("JetStream removing stream '%s > %s' from this server", nsa.Client.serviceAccount(), nsa.Config.Name) 3499 if node := mset.raftNode(); node != nil { 3500 if node.Leader() { 3501 node.StepDown(nsa.Group.Preferred) 3502 } 3503 node.ProposeRemovePeer(ourID) 3504 // shutdown monitor by shutting down raft. 3505 node.Delete() 3506 } 3507 3508 var isShuttingDown bool 3509 // Make sure this node is no longer attached to our stream assignment. 3510 if js, _ := s.getJetStreamCluster(); js != nil { 3511 js.mu.Lock() 3512 nsa.Group.node = nil 3513 isShuttingDown = js.shuttingDown 3514 js.mu.Unlock() 3515 } 3516 3517 if !isShuttingDown { 3518 // wait for monitor to be shutdown. 3519 mset.monitorWg.Wait() 3520 } 3521 mset.stop(true, false) 3522 } 3523 3524 // processClusterUpdateStream is called when we have a stream assignment that 3525 // has been updated for an existing assignment and we are a member. 3526 func (js *jetStream) processClusterUpdateStream(acc *Account, osa, sa *streamAssignment) { 3527 if sa == nil { 3528 return 3529 } 3530 3531 js.mu.Lock() 3532 s, rg := js.srv, sa.Group 3533 client, subject, reply := sa.Client, sa.Subject, sa.Reply 3534 alreadyRunning, numReplicas := osa.Group.node != nil, len(rg.Peers) 3535 needsNode := rg.node == nil 3536 storage, cfg := sa.Config.Storage, sa.Config 3537 hasResponded := sa.responded 3538 sa.responded = true 3539 recovering := sa.recovering 3540 js.mu.Unlock() 3541 3542 mset, err := acc.lookupStream(cfg.Name) 3543 if err == nil && mset != nil { 3544 // Make sure we have not had a new group assigned to us. 3545 if osa.Group.Name != sa.Group.Name { 3546 s.Warnf("JetStream cluster detected stream remapping for '%s > %s' from %q to %q", 3547 acc, cfg.Name, osa.Group.Name, sa.Group.Name) 3548 mset.removeNode() 3549 alreadyRunning, needsNode = false, true 3550 // Make sure to clear from original. 3551 js.mu.Lock() 3552 osa.Group.node = nil 3553 js.mu.Unlock() 3554 } 3555 3556 var needsSetLeader bool 3557 if !alreadyRunning && numReplicas > 1 { 3558 if needsNode { 3559 mset.setLeader(false) 3560 js.createRaftGroup(acc.GetName(), rg, storage, pprofLabels{ 3561 "type": "stream", 3562 "account": mset.accName(), 3563 "stream": mset.name(), 3564 }) 3565 } 3566 mset.monitorWg.Add(1) 3567 // Start monitoring.. 3568 s.startGoRoutine( 3569 func() { js.monitorStream(mset, sa, needsNode) }, 3570 pprofLabels{ 3571 "type": "stream", 3572 "account": mset.accName(), 3573 "stream": mset.name(), 3574 }, 3575 ) 3576 } else if numReplicas == 1 && alreadyRunning { 3577 // We downgraded to R1. Make sure we cleanup the raft node and the stream monitor. 3578 mset.removeNode() 3579 // Make sure we are leader now that we are R1. 3580 needsSetLeader = true 3581 // In case we need to shutdown the cluster specific subs, etc. 3582 mset.setLeader(false) 3583 js.mu.Lock() 3584 rg.node = nil 3585 js.mu.Unlock() 3586 } 3587 // Call update. 3588 if err = mset.updateWithAdvisory(cfg, !recovering); err != nil { 3589 s.Warnf("JetStream cluster error updating stream %q for account %q: %v", cfg.Name, acc.Name, err) 3590 } 3591 // Set the new stream assignment. 3592 mset.setStreamAssignment(sa) 3593 // Make sure we are the leader now that we are R1. 3594 if needsSetLeader { 3595 mset.setLeader(true) 3596 } 3597 } 3598 3599 // If not found we must be expanding into this node since if we are here we know we are a member. 3600 if err == ErrJetStreamStreamNotFound { 3601 js.processStreamAssignment(sa) 3602 return 3603 } 3604 3605 if err != nil { 3606 js.mu.Lock() 3607 sa.err = err 3608 result := &streamAssignmentResult{ 3609 Account: sa.Client.serviceAccount(), 3610 Stream: sa.Config.Name, 3611 Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}, 3612 Update: true, 3613 } 3614 result.Response.Error = NewJSStreamGeneralError(err, Unless(err)) 3615 js.mu.Unlock() 3616 3617 // Send response to the metadata leader. They will forward to the user as needed. 3618 s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result) 3619 return 3620 } 3621 3622 isLeader := mset.IsLeader() 3623 3624 // Check for missing syncSubject bug. 3625 if isLeader && osa != nil && osa.Sync == _EMPTY_ { 3626 if node := mset.raftNode(); node != nil { 3627 node.StepDown() 3628 } 3629 return 3630 } 3631 3632 // If we were a single node being promoted assume leadership role for purpose of responding. 3633 if !hasResponded && !isLeader && !alreadyRunning { 3634 isLeader = true 3635 } 3636 3637 // Check if we should bail. 3638 if !isLeader || hasResponded || recovering { 3639 return 3640 } 3641 3642 // Send our response. 3643 var resp = JSApiStreamUpdateResponse{ApiResponse: ApiResponse{Type: JSApiStreamUpdateResponseType}} 3644 resp.StreamInfo = &StreamInfo{ 3645 Created: mset.createdTime(), 3646 State: mset.state(), 3647 Config: mset.config(), 3648 Cluster: js.clusterInfo(mset.raftGroup()), 3649 Mirror: mset.mirrorInfo(), 3650 Sources: mset.sourcesInfo(), 3651 TimeStamp: time.Now().UTC(), 3652 } 3653 3654 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 3655 } 3656 3657 // processClusterCreateStream is called when we have a stream assignment that 3658 // has been committed and this server is a member of the peer group. 3659 func (js *jetStream) processClusterCreateStream(acc *Account, sa *streamAssignment) { 3660 if sa == nil { 3661 return 3662 } 3663 3664 js.mu.RLock() 3665 s, rg := js.srv, sa.Group 3666 alreadyRunning := rg.node != nil 3667 storage := sa.Config.Storage 3668 restore := sa.Restore 3669 js.mu.RUnlock() 3670 3671 // Process the raft group and make sure it's running if needed. 3672 err := js.createRaftGroup(acc.GetName(), rg, storage, pprofLabels{ 3673 "type": "stream", 3674 "account": acc.Name, 3675 "stream": sa.Config.Name, 3676 }) 3677 3678 // If we are restoring, create the stream if we are R>1 and not the preferred who handles the 3679 // receipt of the snapshot itself. 3680 shouldCreate := true 3681 if restore != nil { 3682 if len(rg.Peers) == 1 || rg.node != nil && rg.node.ID() == rg.Preferred { 3683 shouldCreate = false 3684 } else { 3685 js.mu.Lock() 3686 sa.Restore = nil 3687 js.mu.Unlock() 3688 } 3689 } 3690 3691 // Our stream. 3692 var mset *stream 3693 3694 // Process here if not restoring or not the leader. 3695 if shouldCreate && err == nil { 3696 // Go ahead and create or update the stream. 3697 mset, err = acc.lookupStream(sa.Config.Name) 3698 if err == nil && mset != nil { 3699 osa := mset.streamAssignment() 3700 // If we already have a stream assignment and they are the same exact config, short circuit here. 3701 if osa != nil { 3702 if reflect.DeepEqual(osa.Config, sa.Config) { 3703 if sa.Group.Name == osa.Group.Name && reflect.DeepEqual(sa.Group.Peers, osa.Group.Peers) { 3704 // Since this already exists we know it succeeded, just respond to this caller. 3705 js.mu.RLock() 3706 client, subject, reply, recovering := sa.Client, sa.Subject, sa.Reply, sa.recovering 3707 js.mu.RUnlock() 3708 3709 if !recovering { 3710 var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}} 3711 resp.StreamInfo = &StreamInfo{ 3712 Created: mset.createdTime(), 3713 State: mset.state(), 3714 Config: mset.config(), 3715 Cluster: js.clusterInfo(mset.raftGroup()), 3716 Sources: mset.sourcesInfo(), 3717 Mirror: mset.mirrorInfo(), 3718 TimeStamp: time.Now().UTC(), 3719 } 3720 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 3721 } 3722 return 3723 } else { 3724 // We had a bug where we could have multiple assignments for the same 3725 // stream but with different group assignments, including multiple raft 3726 // groups. So check for that here. We can only bet on the last one being 3727 // consistent in the long run, so let it continue if we see this condition. 3728 s.Warnf("JetStream cluster detected duplicate assignment for stream %q for account %q", sa.Config.Name, acc.Name) 3729 if osa.Group.node != nil && osa.Group.node != sa.Group.node { 3730 osa.Group.node.Delete() 3731 osa.Group.node = nil 3732 } 3733 } 3734 } 3735 } 3736 mset.setStreamAssignment(sa) 3737 // Check if our config has really been updated. 3738 if !reflect.DeepEqual(mset.config(), sa.Config) { 3739 if err = mset.updateWithAdvisory(sa.Config, false); err != nil { 3740 s.Warnf("JetStream cluster error updating stream %q for account %q: %v", sa.Config.Name, acc.Name, err) 3741 if osa != nil { 3742 // Process the raft group and make sure it's running if needed. 3743 js.createRaftGroup(acc.GetName(), osa.Group, storage, pprofLabels{ 3744 "type": "stream", 3745 "account": mset.accName(), 3746 "stream": mset.name(), 3747 }) 3748 mset.setStreamAssignment(osa) 3749 } 3750 if rg.node != nil { 3751 rg.node.Delete() 3752 rg.node = nil 3753 } 3754 } 3755 } 3756 } else if err == NewJSStreamNotFoundError() { 3757 // Add in the stream here. 3758 mset, err = acc.addStreamWithAssignment(sa.Config, nil, sa) 3759 } 3760 if mset != nil { 3761 mset.setCreatedTime(sa.Created) 3762 } 3763 } 3764 3765 // This is an error condition. 3766 if err != nil { 3767 if IsNatsErr(err, JSStreamStoreFailedF) { 3768 s.Warnf("Stream create failed for '%s > %s': %v", sa.Client.serviceAccount(), sa.Config.Name, err) 3769 err = errStreamStoreFailed 3770 } 3771 js.mu.Lock() 3772 3773 sa.err = err 3774 hasResponded := sa.responded 3775 3776 // If out of space do nothing for now. 3777 if isOutOfSpaceErr(err) { 3778 hasResponded = true 3779 } 3780 3781 if rg.node != nil { 3782 rg.node.Delete() 3783 } 3784 3785 var result *streamAssignmentResult 3786 if !hasResponded { 3787 result = &streamAssignmentResult{ 3788 Account: sa.Client.serviceAccount(), 3789 Stream: sa.Config.Name, 3790 Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}, 3791 } 3792 result.Response.Error = NewJSStreamCreateError(err, Unless(err)) 3793 } 3794 js.mu.Unlock() 3795 3796 // Send response to the metadata leader. They will forward to the user as needed. 3797 if result != nil { 3798 s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result) 3799 } 3800 return 3801 } 3802 3803 // Re-capture node. 3804 js.mu.RLock() 3805 node := rg.node 3806 js.mu.RUnlock() 3807 3808 // Start our monitoring routine. 3809 if node != nil { 3810 if !alreadyRunning { 3811 if mset != nil { 3812 mset.monitorWg.Add(1) 3813 } 3814 s.startGoRoutine( 3815 func() { js.monitorStream(mset, sa, false) }, 3816 pprofLabels{ 3817 "type": "stream", 3818 "account": mset.accName(), 3819 "stream": mset.name(), 3820 }, 3821 ) 3822 } 3823 } else { 3824 // Single replica stream, process manually here. 3825 // If we are restoring, process that first. 3826 if sa.Restore != nil { 3827 // We are restoring a stream here. 3828 restoreDoneCh := s.processStreamRestore(sa.Client, acc, sa.Config, _EMPTY_, sa.Reply, _EMPTY_) 3829 s.startGoRoutine(func() { 3830 defer s.grWG.Done() 3831 select { 3832 case err := <-restoreDoneCh: 3833 if err == nil { 3834 mset, err = acc.lookupStream(sa.Config.Name) 3835 if mset != nil { 3836 mset.setStreamAssignment(sa) 3837 mset.setCreatedTime(sa.Created) 3838 } 3839 } 3840 if err != nil { 3841 if mset != nil { 3842 mset.delete() 3843 } 3844 js.mu.Lock() 3845 sa.err = err 3846 result := &streamAssignmentResult{ 3847 Account: sa.Client.serviceAccount(), 3848 Stream: sa.Config.Name, 3849 Restore: &JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}}, 3850 } 3851 result.Restore.Error = NewJSStreamRestoreError(err, Unless(err)) 3852 js.mu.Unlock() 3853 // Send response to the metadata leader. They will forward to the user as needed. 3854 b, _ := json.Marshal(result) // Avoids auto-processing and doing fancy json with newlines. 3855 s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, b) 3856 return 3857 } 3858 js.processStreamLeaderChange(mset, true) 3859 3860 // Check to see if we have restored consumers here. 3861 // These are not currently assigned so we will need to do so here. 3862 if consumers := mset.getPublicConsumers(); len(consumers) > 0 { 3863 js.mu.RLock() 3864 cc := js.cluster 3865 js.mu.RUnlock() 3866 3867 for _, o := range consumers { 3868 name, cfg := o.String(), o.config() 3869 rg := cc.createGroupForConsumer(&cfg, sa) 3870 3871 // Place our initial state here as well for assignment distribution. 3872 ca := &consumerAssignment{ 3873 Group: rg, 3874 Stream: sa.Config.Name, 3875 Name: name, 3876 Config: &cfg, 3877 Client: sa.Client, 3878 Created: o.createdTime(), 3879 } 3880 3881 addEntry := encodeAddConsumerAssignment(ca) 3882 cc.meta.ForwardProposal(addEntry) 3883 3884 // Check to make sure we see the assignment. 3885 go func() { 3886 ticker := time.NewTicker(time.Second) 3887 defer ticker.Stop() 3888 for range ticker.C { 3889 js.mu.RLock() 3890 ca, meta := js.consumerAssignment(ca.Client.serviceAccount(), sa.Config.Name, name), cc.meta 3891 js.mu.RUnlock() 3892 if ca == nil { 3893 s.Warnf("Consumer assignment has not been assigned, retrying") 3894 if meta != nil { 3895 meta.ForwardProposal(addEntry) 3896 } else { 3897 return 3898 } 3899 } else { 3900 return 3901 } 3902 } 3903 }() 3904 } 3905 } 3906 case <-s.quitCh: 3907 return 3908 } 3909 }) 3910 } else { 3911 js.processStreamLeaderChange(mset, true) 3912 } 3913 } 3914 } 3915 3916 // processStreamRemoval is called when followers have replicated an assignment. 3917 func (js *jetStream) processStreamRemoval(sa *streamAssignment) { 3918 js.mu.Lock() 3919 s, cc := js.srv, js.cluster 3920 if s == nil || cc == nil || cc.meta == nil { 3921 // TODO(dlc) - debug at least 3922 js.mu.Unlock() 3923 return 3924 } 3925 stream := sa.Config.Name 3926 isMember := sa.Group.isMember(cc.meta.ID()) 3927 wasLeader := cc.isStreamLeader(sa.Client.serviceAccount(), stream) 3928 3929 // Check if we already have this assigned. 3930 accStreams := cc.streams[sa.Client.serviceAccount()] 3931 needDelete := accStreams != nil && accStreams[stream] != nil 3932 if needDelete { 3933 delete(accStreams, stream) 3934 if len(accStreams) == 0 { 3935 delete(cc.streams, sa.Client.serviceAccount()) 3936 } 3937 } 3938 js.mu.Unlock() 3939 3940 if needDelete { 3941 js.processClusterDeleteStream(sa, isMember, wasLeader) 3942 } 3943 } 3944 3945 func (js *jetStream) processClusterDeleteStream(sa *streamAssignment, isMember, wasLeader bool) { 3946 if sa == nil { 3947 return 3948 } 3949 js.mu.RLock() 3950 s := js.srv 3951 node := sa.Group.node 3952 hadLeader := node == nil || node.GroupLeader() != noLeader 3953 offline := s.allPeersOffline(sa.Group) 3954 var isMetaLeader bool 3955 if cc := js.cluster; cc != nil { 3956 isMetaLeader = cc.isLeader() 3957 } 3958 recovering := sa.recovering 3959 js.mu.RUnlock() 3960 3961 stopped := false 3962 var resp = JSApiStreamDeleteResponse{ApiResponse: ApiResponse{Type: JSApiStreamDeleteResponseType}} 3963 var err error 3964 var acc *Account 3965 3966 // Go ahead and delete the stream if we have it and the account here. 3967 if acc, _ = s.LookupAccount(sa.Client.serviceAccount()); acc != nil { 3968 if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { 3969 // shut down monitor by shutting down raft 3970 if n := mset.raftNode(); n != nil { 3971 n.Delete() 3972 } 3973 // wait for monitor to be shut down 3974 mset.monitorWg.Wait() 3975 err = mset.stop(true, wasLeader) 3976 stopped = true 3977 } else if isMember { 3978 s.Warnf("JetStream failed to lookup running stream while removing stream '%s > %s' from this server", 3979 sa.Client.serviceAccount(), sa.Config.Name) 3980 } 3981 } else if isMember { 3982 s.Warnf("JetStream failed to lookup account while removing stream '%s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name) 3983 } 3984 3985 // Always delete the node if present. 3986 if node != nil { 3987 node.Delete() 3988 } 3989 3990 // This is a stop gap cleanup in case 3991 // 1) the account does not exist (and mset couldn't be stopped) and/or 3992 // 2) node was nil (and couldn't be deleted) 3993 if !stopped || node == nil { 3994 if sacc := s.SystemAccount(); sacc != nil { 3995 saccName := sacc.GetName() 3996 os.RemoveAll(filepath.Join(js.config.StoreDir, saccName, defaultStoreDirName, sa.Group.Name)) 3997 // cleanup dependent consumer groups 3998 if !stopped { 3999 for _, ca := range sa.consumers { 4000 // Make sure we cleanup any possible running nodes for the consumers. 4001 if isMember && ca.Group != nil && ca.Group.node != nil { 4002 ca.Group.node.Delete() 4003 } 4004 os.RemoveAll(filepath.Join(js.config.StoreDir, saccName, defaultStoreDirName, ca.Group.Name)) 4005 } 4006 } 4007 } 4008 } 4009 accDir := filepath.Join(js.config.StoreDir, sa.Client.serviceAccount()) 4010 streamDir := filepath.Join(accDir, streamsDir) 4011 os.RemoveAll(filepath.Join(streamDir, sa.Config.Name)) 4012 4013 // no op if not empty 4014 os.Remove(streamDir) 4015 os.Remove(accDir) 4016 4017 // Normally we want only the leader to respond here, but if we had no leader then all members will respond to make 4018 // sure we get feedback to the user. 4019 if !isMember || (hadLeader && !wasLeader) { 4020 // If all the peers are offline and we are the meta leader we will also respond, so suppress returning here. 4021 if !(offline && isMetaLeader) { 4022 return 4023 } 4024 } 4025 4026 // Do not respond if the account does not exist any longer 4027 if acc == nil || recovering { 4028 return 4029 } 4030 4031 if err != nil { 4032 resp.Error = NewJSStreamGeneralError(err, Unless(err)) 4033 s.sendAPIErrResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, s.jsonResponse(resp)) 4034 } else { 4035 resp.Success = true 4036 s.sendAPIResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, s.jsonResponse(resp)) 4037 } 4038 } 4039 4040 // processConsumerAssignment is called when followers have replicated an assignment for a consumer. 4041 func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) { 4042 js.mu.RLock() 4043 s, cc := js.srv, js.cluster 4044 accName, stream, consumerName := ca.Client.serviceAccount(), ca.Stream, ca.Name 4045 noMeta := cc == nil || cc.meta == nil 4046 shuttingDown := js.shuttingDown 4047 var ourID string 4048 if !noMeta { 4049 ourID = cc.meta.ID() 4050 } 4051 var isMember bool 4052 if ca.Group != nil && ourID != _EMPTY_ { 4053 isMember = ca.Group.isMember(ourID) 4054 } 4055 js.mu.RUnlock() 4056 4057 if s == nil || noMeta || shuttingDown { 4058 return 4059 } 4060 4061 sa := js.streamAssignment(accName, stream) 4062 if sa == nil { 4063 s.Debugf("Consumer create failed, could not locate stream '%s > %s'", accName, stream) 4064 return 4065 } 4066 4067 // Might need this below. 4068 numReplicas := sa.Config.Replicas 4069 4070 // Track if this existed already. 4071 var wasExisting bool 4072 4073 // Check if we have an existing consumer assignment. 4074 js.mu.Lock() 4075 if sa.consumers == nil { 4076 sa.consumers = make(map[string]*consumerAssignment) 4077 } else if oca := sa.consumers[ca.Name]; oca != nil { 4078 wasExisting = true 4079 // Copy over private existing state from former SA. 4080 if ca.Group != nil { 4081 ca.Group.node = oca.Group.node 4082 } 4083 ca.responded = oca.responded 4084 ca.err = oca.err 4085 } 4086 4087 // Capture the optional state. We will pass it along if we are a member to apply. 4088 // This is only applicable when restoring a stream with consumers. 4089 state := ca.State 4090 ca.State = nil 4091 4092 // Place into our internal map under the stream assignment. 4093 // Ok to replace an existing one, we check on process call below. 4094 sa.consumers[ca.Name] = ca 4095 js.mu.Unlock() 4096 4097 acc, err := s.LookupAccount(accName) 4098 if err != nil { 4099 ll := fmt.Sprintf("Account [%s] lookup for consumer create failed: %v", accName, err) 4100 if isMember { 4101 if !js.isMetaRecovering() { 4102 // If we can not lookup the account and we are a member, send this result back to the metacontroller leader. 4103 result := &consumerAssignmentResult{ 4104 Account: accName, 4105 Stream: stream, 4106 Consumer: consumerName, 4107 Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}, 4108 } 4109 result.Response.Error = NewJSNoAccountError() 4110 s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result) 4111 } 4112 s.Warnf(ll) 4113 } else { 4114 s.Debugf(ll) 4115 } 4116 return 4117 } 4118 4119 // Check if this is for us.. 4120 if isMember { 4121 js.processClusterCreateConsumer(ca, state, wasExisting) 4122 } else { 4123 // We need to be removed here, we are no longer assigned. 4124 // Grab consumer if we have it. 4125 var o *consumer 4126 if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil { 4127 o = mset.lookupConsumer(ca.Name) 4128 } 4129 4130 // Check if we have a raft node running, meaning we are no longer part of the group but were. 4131 js.mu.Lock() 4132 if node := ca.Group.node; node != nil { 4133 // We have one here even though we are not a member. This can happen on re-assignment. 4134 s.Debugf("JetStream removing consumer '%s > %s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name, ca.Name) 4135 if node.Leader() { 4136 s.Debugf("JetStream consumer '%s > %s > %s' is being removed and was the leader, will perform stepdown", 4137 sa.Client.serviceAccount(), sa.Config.Name, ca.Name) 4138 4139 peers, cn := node.Peers(), s.cachedClusterName() 4140 migrating := numReplicas != len(peers) 4141 4142 // Select a new peer to transfer to. If we are a migrating make sure its from the new cluster. 4143 var npeer string 4144 for _, r := range peers { 4145 if !r.Current { 4146 continue 4147 } 4148 if !migrating { 4149 npeer = r.ID 4150 break 4151 } else if sir, ok := s.nodeToInfo.Load(r.ID); ok && sir != nil { 4152 si := sir.(nodeInfo) 4153 if si.cluster != cn { 4154 npeer = r.ID 4155 break 4156 } 4157 } 4158 } 4159 // Clear the raftnode from our consumer so that a subsequent o.delete will not also issue a stepdown. 4160 if o != nil { 4161 o.clearRaftNode() 4162 } 4163 // Manually handle the stepdown and deletion of the node. 4164 node.UpdateKnownPeers(ca.Group.Peers) 4165 node.StepDown(npeer) 4166 node.Delete() 4167 } else { 4168 node.UpdateKnownPeers(ca.Group.Peers) 4169 } 4170 } 4171 // Always clear the old node. 4172 ca.Group.node = nil 4173 ca.err = nil 4174 js.mu.Unlock() 4175 4176 if o != nil { 4177 o.deleteWithoutAdvisory() 4178 } 4179 } 4180 } 4181 4182 func (js *jetStream) processConsumerRemoval(ca *consumerAssignment) { 4183 js.mu.Lock() 4184 s, cc := js.srv, js.cluster 4185 if s == nil || cc == nil || cc.meta == nil { 4186 // TODO(dlc) - debug at least 4187 js.mu.Unlock() 4188 return 4189 } 4190 wasLeader := cc.isConsumerLeader(ca.Client.serviceAccount(), ca.Stream, ca.Name) 4191 4192 // Delete from our state. 4193 var needDelete bool 4194 if accStreams := cc.streams[ca.Client.serviceAccount()]; accStreams != nil { 4195 if sa := accStreams[ca.Stream]; sa != nil && sa.consumers != nil && sa.consumers[ca.Name] != nil { 4196 oca := sa.consumers[ca.Name] 4197 // Make sure this removal is for what we have, otherwise ignore. 4198 if ca.Group != nil && oca.Group != nil && ca.Group.Name == oca.Group.Name { 4199 needDelete = true 4200 oca.deleted = true 4201 delete(sa.consumers, ca.Name) 4202 } 4203 } 4204 } 4205 js.mu.Unlock() 4206 4207 if needDelete { 4208 js.processClusterDeleteConsumer(ca, wasLeader) 4209 } 4210 } 4211 4212 type consumerAssignmentResult struct { 4213 Account string `json:"account"` 4214 Stream string `json:"stream"` 4215 Consumer string `json:"consumer"` 4216 Response *JSApiConsumerCreateResponse `json:"response,omitempty"` 4217 } 4218 4219 // processClusterCreateConsumer is when we are a member of the group and need to create the consumer. 4220 func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state *ConsumerState, wasExisting bool) { 4221 if ca == nil { 4222 return 4223 } 4224 js.mu.RLock() 4225 s := js.srv 4226 rg := ca.Group 4227 alreadyRunning := rg != nil && rg.node != nil 4228 accName, stream, consumer := ca.Client.serviceAccount(), ca.Stream, ca.Name 4229 js.mu.RUnlock() 4230 4231 acc, err := s.LookupAccount(accName) 4232 if err != nil { 4233 s.Warnf("JetStream cluster failed to lookup axccount %q: %v", accName, err) 4234 return 4235 } 4236 4237 // Go ahead and create or update the consumer. 4238 mset, err := acc.lookupStream(stream) 4239 if err != nil { 4240 if !js.isMetaRecovering() { 4241 js.mu.Lock() 4242 s.Warnf("Consumer create failed, could not locate stream '%s > %s > %s'", ca.Client.serviceAccount(), ca.Stream, ca.Name) 4243 ca.err = NewJSStreamNotFoundError() 4244 result := &consumerAssignmentResult{ 4245 Account: ca.Client.serviceAccount(), 4246 Stream: ca.Stream, 4247 Consumer: ca.Name, 4248 Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}, 4249 } 4250 result.Response.Error = NewJSStreamNotFoundError() 4251 s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result) 4252 js.mu.Unlock() 4253 } 4254 return 4255 } 4256 4257 // Check if we already have this consumer running. 4258 o := mset.lookupConsumer(consumer) 4259 4260 if !alreadyRunning { 4261 // Process the raft group and make sure its running if needed. 4262 storage := mset.config().Storage 4263 if ca.Config.MemoryStorage { 4264 storage = MemoryStorage 4265 } 4266 // No-op if R1. 4267 js.createRaftGroup(accName, rg, storage, pprofLabels{ 4268 "type": "consumer", 4269 "account": mset.accName(), 4270 "stream": ca.Stream, 4271 "consumer": ca.Name, 4272 }) 4273 } else { 4274 // If we are clustered update the known peers. 4275 js.mu.RLock() 4276 if node := rg.node; node != nil { 4277 node.UpdateKnownPeers(ca.Group.Peers) 4278 } 4279 js.mu.RUnlock() 4280 } 4281 4282 // Check if we already have this consumer running. 4283 var didCreate, isConfigUpdate, needsLocalResponse bool 4284 if o == nil { 4285 // Add in the consumer if needed. 4286 if o, err = mset.addConsumerWithAssignment(ca.Config, ca.Name, ca, js.isMetaRecovering(), ActionCreateOrUpdate); err == nil { 4287 didCreate = true 4288 } 4289 } else { 4290 // This consumer exists. 4291 // Only update if config is really different. 4292 cfg := o.config() 4293 if isConfigUpdate = !reflect.DeepEqual(&cfg, ca.Config); isConfigUpdate { 4294 // Call into update, ignore consumer exists error here since this means an old deliver subject is bound 4295 // which can happen on restart etc. 4296 if err := o.updateConfig(ca.Config); err != nil && err != NewJSConsumerNameExistError() { 4297 // This is essentially an update that has failed. Respond back to metaleader if we are not recovering. 4298 js.mu.RLock() 4299 if !js.metaRecovering { 4300 result := &consumerAssignmentResult{ 4301 Account: accName, 4302 Stream: stream, 4303 Consumer: consumer, 4304 Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}, 4305 } 4306 result.Response.Error = NewJSConsumerNameExistError() 4307 s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result) 4308 } 4309 s.Warnf("Consumer create failed during update for '%s > %s > %s': %v", ca.Client.serviceAccount(), ca.Stream, ca.Name, err) 4310 js.mu.RUnlock() 4311 return 4312 } 4313 } 4314 4315 var sendState bool 4316 js.mu.RLock() 4317 n := rg.node 4318 // Check if we already had a consumer assignment and its still pending. 4319 cca, oca := ca, o.consumerAssignment() 4320 if oca != nil { 4321 if !oca.responded { 4322 // We can't override info for replying here otherwise leader once elected can not respond. 4323 // So copy over original client and the reply from the old ca. 4324 cac := *ca 4325 cac.Client = oca.Client 4326 cac.Reply = oca.Reply 4327 cca = &cac 4328 needsLocalResponse = true 4329 } 4330 // If we look like we are scaling up, let's send our current state to the group. 4331 sendState = len(ca.Group.Peers) > len(oca.Group.Peers) && o.IsLeader() && n != nil 4332 // Signal that this is an update 4333 if ca.Reply != _EMPTY_ { 4334 isConfigUpdate = true 4335 } 4336 } 4337 js.mu.RUnlock() 4338 4339 if sendState { 4340 if snap, err := o.store.EncodedState(); err == nil { 4341 n.SendSnapshot(snap) 4342 } 4343 } 4344 4345 // Set CA for our consumer. 4346 o.setConsumerAssignment(cca) 4347 s.Debugf("JetStream cluster, consumer '%s > %s > %s' was already running", ca.Client.serviceAccount(), ca.Stream, ca.Name) 4348 } 4349 4350 // If we have an initial state set apply that now. 4351 if state != nil && o != nil { 4352 o.mu.Lock() 4353 err = o.setStoreState(state) 4354 o.mu.Unlock() 4355 } 4356 4357 if err != nil { 4358 if IsNatsErr(err, JSConsumerStoreFailedErrF) { 4359 s.Warnf("Consumer create failed for '%s > %s > %s': %v", ca.Client.serviceAccount(), ca.Stream, ca.Name, err) 4360 err = errConsumerStoreFailed 4361 } 4362 4363 js.mu.Lock() 4364 4365 ca.err = err 4366 hasResponded := ca.responded 4367 4368 // If out of space do nothing for now. 4369 if isOutOfSpaceErr(err) { 4370 hasResponded = true 4371 } 4372 4373 if rg.node != nil { 4374 rg.node.Delete() 4375 // Clear the node here. 4376 rg.node = nil 4377 } 4378 4379 // If we did seem to create a consumer make sure to stop it. 4380 if o != nil { 4381 o.stop() 4382 } 4383 4384 var result *consumerAssignmentResult 4385 if !hasResponded && !js.metaRecovering { 4386 result = &consumerAssignmentResult{ 4387 Account: ca.Client.serviceAccount(), 4388 Stream: ca.Stream, 4389 Consumer: ca.Name, 4390 Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}, 4391 } 4392 result.Response.Error = NewJSConsumerCreateError(err, Unless(err)) 4393 } else if err == errNoInterest { 4394 // This is a stranded ephemeral, let's clean this one up. 4395 subject := fmt.Sprintf(JSApiConsumerDeleteT, ca.Stream, ca.Name) 4396 mset.outq.send(newJSPubMsg(subject, _EMPTY_, _EMPTY_, nil, nil, nil, 0)) 4397 } 4398 js.mu.Unlock() 4399 4400 if result != nil { 4401 // Send response to the metadata leader. They will forward to the user as needed. 4402 b, _ := json.Marshal(result) // Avoids auto-processing and doing fancy json with newlines. 4403 s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, b) 4404 } 4405 } else { 4406 if didCreate { 4407 o.setCreatedTime(ca.Created) 4408 } else { 4409 // Check for scale down to 1.. 4410 if rg.node != nil && len(rg.Peers) == 1 { 4411 o.clearNode() 4412 o.setLeader(true) 4413 // Need to clear from rg too. 4414 js.mu.Lock() 4415 rg.node = nil 4416 client, subject, reply := ca.Client, ca.Subject, ca.Reply 4417 js.mu.Unlock() 4418 var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} 4419 resp.ConsumerInfo = o.info() 4420 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 4421 return 4422 } 4423 } 4424 4425 if rg.node == nil { 4426 // Single replica consumer, process manually here. 4427 js.mu.Lock() 4428 // Force response in case we think this is an update. 4429 if !js.metaRecovering && isConfigUpdate { 4430 ca.responded = false 4431 } 4432 js.mu.Unlock() 4433 js.processConsumerLeaderChange(o, true) 4434 } else { 4435 // Clustered consumer. 4436 // Start our monitoring routine if needed. 4437 if !alreadyRunning && o.shouldStartMonitor() { 4438 s.startGoRoutine( 4439 func() { js.monitorConsumer(o, ca) }, 4440 pprofLabels{ 4441 "type": "consumer", 4442 "account": mset.accName(), 4443 "stream": mset.name(), 4444 "consumer": ca.Name, 4445 }, 4446 ) 4447 } 4448 // For existing consumer, only send response if not recovering. 4449 if wasExisting && !js.isMetaRecovering() { 4450 if o.IsLeader() || (!didCreate && needsLocalResponse) { 4451 // Process if existing as an update. Double check that this is not recovered. 4452 js.mu.RLock() 4453 client, subject, reply, recovering := ca.Client, ca.Subject, ca.Reply, ca.recovering 4454 js.mu.RUnlock() 4455 if !recovering { 4456 var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} 4457 resp.ConsumerInfo = o.info() 4458 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 4459 } 4460 } 4461 } 4462 } 4463 } 4464 } 4465 4466 func (js *jetStream) processClusterDeleteConsumer(ca *consumerAssignment, wasLeader bool) { 4467 if ca == nil { 4468 return 4469 } 4470 js.mu.RLock() 4471 s := js.srv 4472 node := ca.Group.node 4473 offline := s.allPeersOffline(ca.Group) 4474 var isMetaLeader bool 4475 if cc := js.cluster; cc != nil { 4476 isMetaLeader = cc.isLeader() 4477 } 4478 recovering := ca.recovering 4479 js.mu.RUnlock() 4480 4481 var resp = JSApiConsumerDeleteResponse{ApiResponse: ApiResponse{Type: JSApiConsumerDeleteResponseType}} 4482 var err error 4483 var acc *Account 4484 4485 // Go ahead and delete the consumer if we have it and the account. 4486 if acc, _ = s.LookupAccount(ca.Client.serviceAccount()); acc != nil { 4487 if mset, _ := acc.lookupStream(ca.Stream); mset != nil { 4488 if o := mset.lookupConsumer(ca.Name); o != nil { 4489 err = o.stopWithFlags(true, false, true, wasLeader) 4490 } 4491 } 4492 } else if ca.Group != nil { 4493 // We have a missing account, see if we can cleanup. 4494 if sacc := s.SystemAccount(); sacc != nil { 4495 os.RemoveAll(filepath.Join(js.config.StoreDir, sacc.GetName(), defaultStoreDirName, ca.Group.Name)) 4496 } 4497 } 4498 4499 // Always delete the node if present. 4500 if node != nil { 4501 node.Delete() 4502 } 4503 4504 if !wasLeader || ca.Reply == _EMPTY_ { 4505 if !(offline && isMetaLeader) { 4506 return 4507 } 4508 } 4509 4510 // Do not respond if the account does not exist any longer or this is during recovery. 4511 if acc == nil || recovering { 4512 return 4513 } 4514 4515 if err != nil { 4516 resp.Error = NewJSStreamNotFoundError(Unless(err)) 4517 s.sendAPIErrResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(resp)) 4518 } else { 4519 resp.Success = true 4520 s.sendAPIResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(resp)) 4521 } 4522 } 4523 4524 // Returns the consumer assignment, or nil if not present. 4525 // Lock should be held. 4526 func (js *jetStream) consumerAssignment(account, stream, consumer string) *consumerAssignment { 4527 if sa := js.streamAssignment(account, stream); sa != nil { 4528 return sa.consumers[consumer] 4529 } 4530 return nil 4531 } 4532 4533 // consumerAssigned informs us if this server has this consumer assigned. 4534 func (jsa *jsAccount) consumerAssigned(stream, consumer string) bool { 4535 jsa.mu.RLock() 4536 js, acc := jsa.js, jsa.account 4537 jsa.mu.RUnlock() 4538 4539 if js == nil { 4540 return false 4541 } 4542 js.mu.RLock() 4543 defer js.mu.RUnlock() 4544 return js.cluster.isConsumerAssigned(acc, stream, consumer) 4545 } 4546 4547 // Read lock should be held. 4548 func (cc *jetStreamCluster) isConsumerAssigned(a *Account, stream, consumer string) bool { 4549 // Non-clustered mode always return true. 4550 if cc == nil { 4551 return true 4552 } 4553 if cc.meta == nil { 4554 return false 4555 } 4556 var sa *streamAssignment 4557 accStreams := cc.streams[a.Name] 4558 if accStreams != nil { 4559 sa = accStreams[stream] 4560 } 4561 if sa == nil { 4562 // TODO(dlc) - This should not happen. 4563 return false 4564 } 4565 ca := sa.consumers[consumer] 4566 if ca == nil { 4567 return false 4568 } 4569 rg := ca.Group 4570 // Check if we are the leader of this raftGroup assigned to the stream. 4571 ourID := cc.meta.ID() 4572 for _, peer := range rg.Peers { 4573 if peer == ourID { 4574 return true 4575 } 4576 } 4577 return false 4578 } 4579 4580 // Returns our stream and underlying raft node. 4581 func (o *consumer) streamAndNode() (*stream, RaftNode) { 4582 if o == nil { 4583 return nil, nil 4584 } 4585 o.mu.RLock() 4586 defer o.mu.RUnlock() 4587 return o.mset, o.node 4588 } 4589 4590 // Return the replica count for this consumer. If the consumer has been 4591 // stopped, this will return an error. 4592 func (o *consumer) replica() (int, error) { 4593 o.mu.RLock() 4594 oCfg := o.cfg 4595 mset := o.mset 4596 o.mu.RUnlock() 4597 if mset == nil { 4598 return 0, errBadConsumer 4599 } 4600 sCfg := mset.config() 4601 return oCfg.replicas(&sCfg), nil 4602 } 4603 4604 func (o *consumer) raftGroup() *raftGroup { 4605 if o == nil { 4606 return nil 4607 } 4608 o.mu.RLock() 4609 defer o.mu.RUnlock() 4610 if o.ca == nil { 4611 return nil 4612 } 4613 return o.ca.Group 4614 } 4615 4616 func (o *consumer) clearRaftNode() { 4617 if o == nil { 4618 return 4619 } 4620 o.mu.Lock() 4621 defer o.mu.Unlock() 4622 o.node = nil 4623 } 4624 4625 func (o *consumer) raftNode() RaftNode { 4626 if o == nil { 4627 return nil 4628 } 4629 o.mu.RLock() 4630 defer o.mu.RUnlock() 4631 return o.node 4632 } 4633 4634 func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { 4635 s, n, cc := js.server(), o.raftNode(), js.cluster 4636 defer s.grWG.Done() 4637 4638 defer o.clearMonitorRunning() 4639 4640 if n == nil { 4641 s.Warnf("No RAFT group for '%s > %s > %s'", o.acc.Name, ca.Stream, ca.Name) 4642 return 4643 } 4644 4645 // Make sure to stop the raft group on exit to prevent accidental memory bloat. 4646 // This should be below the checkInMonitor call though to avoid stopping it out 4647 // from underneath the one that is running since it will be the same raft node. 4648 defer n.Stop() 4649 4650 qch, lch, aq, uch, ourPeerId := n.QuitC(), n.LeadChangeC(), n.ApplyQ(), o.updateC(), cc.meta.ID() 4651 4652 s.Debugf("Starting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group()) 4653 defer s.Debugf("Exiting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group()) 4654 4655 const ( 4656 compactInterval = 2 * time.Minute 4657 compactSizeMin = 64 * 1024 // What is stored here is always small for consumers. 4658 compactNumMin = 1024 4659 minSnapDelta = 10 * time.Second 4660 ) 4661 4662 // Spread these out for large numbers on server restart. 4663 rci := time.Duration(rand.Int63n(int64(time.Minute))) 4664 t := time.NewTicker(compactInterval + rci) 4665 defer t.Stop() 4666 4667 // Highwayhash key for generating hashes. 4668 key := make([]byte, 32) 4669 crand.Read(key) 4670 4671 // Hash of the last snapshot (fixed size in memory). 4672 var lastSnap []byte 4673 var lastSnapTime time.Time 4674 4675 // Don't allow the upper layer to install snapshots until we have 4676 // fully recovered from disk. 4677 recovering := true 4678 4679 doSnapshot := func(force bool) { 4680 // Bail if trying too fast and not in a forced situation. 4681 if recovering || (!force && time.Since(lastSnapTime) < minSnapDelta) { 4682 return 4683 } 4684 4685 // Check several things to see if we need a snapshot. 4686 ne, nb := n.Size() 4687 if !n.NeedSnapshot() { 4688 // Check if we should compact etc. based on size of log. 4689 if !force && ne < compactNumMin && nb < compactSizeMin { 4690 return 4691 } 4692 } 4693 4694 if snap, err := o.store.EncodedState(); err == nil { 4695 hash := highwayhash.Sum(snap, key) 4696 // If the state hasn't changed but the log has gone way over 4697 // the compaction size then we will want to compact anyway. 4698 // This can happen for example when a pull consumer fetches a 4699 // lot on an idle stream, log entries get distributed but the 4700 // state never changes, therefore the log never gets compacted. 4701 if !bytes.Equal(hash[:], lastSnap) || ne >= compactNumMin || nb >= compactSizeMin { 4702 if err := n.InstallSnapshot(snap); err == nil { 4703 lastSnap, lastSnapTime = hash[:], time.Now() 4704 } else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning { 4705 s.RateLimitWarnf("Failed to install snapshot for '%s > %s > %s' [%s]: %v", o.acc.Name, ca.Stream, ca.Name, n.Group(), err) 4706 } 4707 } 4708 } 4709 } 4710 4711 // For migration tracking. 4712 var mmt *time.Ticker 4713 var mmtc <-chan time.Time 4714 4715 startMigrationMonitoring := func() { 4716 if mmt == nil { 4717 mmt = time.NewTicker(500 * time.Millisecond) 4718 mmtc = mmt.C 4719 } 4720 } 4721 4722 stopMigrationMonitoring := func() { 4723 if mmt != nil { 4724 mmt.Stop() 4725 mmt, mmtc = nil, nil 4726 } 4727 } 4728 defer stopMigrationMonitoring() 4729 4730 // Track if we are leader. 4731 var isLeader bool 4732 4733 for { 4734 select { 4735 case <-s.quitCh: 4736 return 4737 case <-qch: 4738 return 4739 case <-aq.ch: 4740 ces := aq.pop() 4741 for _, ce := range ces { 4742 // No special processing needed for when we are caught up on restart. 4743 if ce == nil { 4744 recovering = false 4745 if n.NeedSnapshot() { 4746 doSnapshot(true) 4747 } 4748 } else if err := js.applyConsumerEntries(o, ce, isLeader); err == nil { 4749 ne, nb := n.Applied(ce.Index) 4750 ce.ReturnToPool() 4751 // If we have at least min entries to compact, go ahead and snapshot/compact. 4752 if nb > 0 && ne >= compactNumMin || nb > compactSizeMin { 4753 doSnapshot(false) 4754 } 4755 } else if err != errConsumerClosed { 4756 s.Warnf("Error applying consumer entries to '%s > %s'", ca.Client.serviceAccount(), ca.Name) 4757 } 4758 } 4759 aq.recycle(&ces) 4760 case isLeader = <-lch: 4761 if recovering && !isLeader { 4762 js.setConsumerAssignmentRecovering(ca) 4763 } 4764 4765 // Process the change. 4766 if err := js.processConsumerLeaderChange(o, isLeader); err == nil && isLeader { 4767 // Check our state if we are under an interest based stream. 4768 o.checkStateForInterestStream() 4769 // Do a snapshot. 4770 doSnapshot(true) 4771 // Synchronize followers to our state. Only send out if we have state. 4772 if n != nil { 4773 if _, _, applied := n.Progress(); applied > 0 { 4774 if snap, err := o.store.EncodedState(); err == nil { 4775 n.SendSnapshot(snap) 4776 } 4777 } 4778 } 4779 } 4780 4781 // We may receive a leader change after the consumer assignment which would cancel us 4782 // monitoring for this closely. So re-assess our state here as well. 4783 // Or the old leader is no longer part of the set and transferred leadership 4784 // for this leader to resume with removal 4785 rg := o.raftGroup() 4786 4787 // Check for migrations (peer count and replica count differ) here. 4788 // We set the state on the stream assignment update below. 4789 replicas, err := o.replica() 4790 if err != nil { 4791 continue 4792 } 4793 if isLeader && len(rg.Peers) != replicas { 4794 startMigrationMonitoring() 4795 } else { 4796 stopMigrationMonitoring() 4797 } 4798 case <-uch: 4799 // keep consumer assignment current 4800 ca = o.consumerAssignment() 4801 // We get this when we have a new consumer assignment caused by an update. 4802 // We want to know if we are migrating. 4803 rg := o.raftGroup() 4804 // keep peer list up to date with config 4805 js.checkPeers(rg) 4806 // If we are migrating, monitor for the new peers to be caught up. 4807 replicas, err := o.replica() 4808 if err != nil { 4809 continue 4810 } 4811 if isLeader && len(rg.Peers) != replicas { 4812 startMigrationMonitoring() 4813 } else { 4814 stopMigrationMonitoring() 4815 } 4816 case <-mmtc: 4817 if !isLeader { 4818 // We are no longer leader, so not our job. 4819 stopMigrationMonitoring() 4820 continue 4821 } 4822 rg := o.raftGroup() 4823 ci := js.clusterInfo(rg) 4824 replicas, err := o.replica() 4825 if err != nil { 4826 continue 4827 } 4828 if len(rg.Peers) <= replicas { 4829 // Migration no longer happening, so not our job anymore 4830 stopMigrationMonitoring() 4831 continue 4832 } 4833 newPeers, oldPeers, newPeerSet, _ := genPeerInfo(rg.Peers, len(rg.Peers)-replicas) 4834 4835 // If we are part of the new peerset and we have been passed the baton. 4836 // We will handle scale down. 4837 if newPeerSet[ourPeerId] { 4838 for _, p := range oldPeers { 4839 n.ProposeRemovePeer(p) 4840 } 4841 cca := ca.copyGroup() 4842 cca.Group.Peers = newPeers 4843 cca.Group.Cluster = s.cachedClusterName() 4844 cc.meta.ForwardProposal(encodeAddConsumerAssignment(cca)) 4845 s.Noticef("Scaling down '%s > %s > %s' to %+v", ca.Client.serviceAccount(), ca.Stream, ca.Name, s.peerSetToNames(newPeers)) 4846 4847 } else { 4848 var newLeaderPeer, newLeader, newCluster string 4849 neededCurrent, current := replicas/2+1, 0 4850 for _, r := range ci.Replicas { 4851 if r.Current && newPeerSet[r.Peer] { 4852 current++ 4853 if newCluster == _EMPTY_ { 4854 newLeaderPeer, newLeader, newCluster = r.Peer, r.Name, r.cluster 4855 } 4856 } 4857 } 4858 4859 // Check if we have a quorom 4860 if current >= neededCurrent { 4861 s.Noticef("Transfer of consumer leader for '%s > %s > %s' to '%s'", ca.Client.serviceAccount(), ca.Stream, ca.Name, newLeader) 4862 n.StepDown(newLeaderPeer) 4863 } 4864 } 4865 4866 case <-t.C: 4867 doSnapshot(false) 4868 } 4869 } 4870 } 4871 4872 func (js *jetStream) applyConsumerEntries(o *consumer, ce *CommittedEntry, isLeader bool) error { 4873 for _, e := range ce.Entries { 4874 if e.Type == EntrySnapshot { 4875 if !isLeader { 4876 // No-op needed? 4877 state, err := decodeConsumerState(e.Data) 4878 if err != nil { 4879 if mset, node := o.streamAndNode(); mset != nil && node != nil { 4880 s := js.srv 4881 s.Errorf("JetStream cluster could not decode consumer snapshot for '%s > %s > %s' [%s]", 4882 mset.account(), mset.name(), o, node.Group()) 4883 } 4884 panic(err.Error()) 4885 } 4886 if err = o.store.Update(state); err != nil { 4887 o.mu.RLock() 4888 s, acc, mset, name := o.srv, o.acc, o.mset, o.name 4889 o.mu.RUnlock() 4890 if s != nil && mset != nil { 4891 s.Warnf("Consumer '%s > %s > %s' error on store update from snapshot entry: %v", acc, mset.name(), name, err) 4892 } 4893 } 4894 // Check our interest state if applicable. 4895 o.checkStateForInterestStream() 4896 } 4897 4898 } else if e.Type == EntryRemovePeer { 4899 js.mu.RLock() 4900 var ourID string 4901 if js.cluster != nil && js.cluster.meta != nil { 4902 ourID = js.cluster.meta.ID() 4903 } 4904 js.mu.RUnlock() 4905 if peer := string(e.Data); peer == ourID { 4906 shouldRemove := true 4907 if mset := o.getStream(); mset != nil { 4908 if sa := mset.streamAssignment(); sa != nil && sa.Group != nil { 4909 js.mu.RLock() 4910 shouldRemove = !sa.Group.isMember(ourID) 4911 js.mu.RUnlock() 4912 } 4913 } 4914 if shouldRemove { 4915 o.stopWithFlags(true, false, false, false) 4916 } 4917 } 4918 return nil 4919 } else if e.Type == EntryAddPeer { 4920 // Ignore for now. 4921 } else { 4922 buf := e.Data 4923 switch entryOp(buf[0]) { 4924 case updateDeliveredOp: 4925 // These are handled in place in leaders. 4926 if !isLeader { 4927 dseq, sseq, dc, ts, err := decodeDeliveredUpdate(buf[1:]) 4928 if err != nil { 4929 if mset, node := o.streamAndNode(); mset != nil && node != nil { 4930 s := js.srv 4931 s.Errorf("JetStream cluster could not decode consumer delivered update for '%s > %s > %s' [%s]", 4932 mset.account(), mset.name(), o, node.Group()) 4933 } 4934 panic(err.Error()) 4935 } 4936 // Make sure to update delivered under the lock. 4937 o.mu.Lock() 4938 err = o.store.UpdateDelivered(dseq, sseq, dc, ts) 4939 o.ldt = time.Now() 4940 o.mu.Unlock() 4941 if err != nil { 4942 panic(err.Error()) 4943 } 4944 } 4945 case updateAcksOp: 4946 dseq, sseq, err := decodeAckUpdate(buf[1:]) 4947 if err != nil { 4948 if mset, node := o.streamAndNode(); mset != nil && node != nil { 4949 s := js.srv 4950 s.Errorf("JetStream cluster could not decode consumer ack update for '%s > %s > %s' [%s]", 4951 mset.account(), mset.name(), o, node.Group()) 4952 } 4953 panic(err.Error()) 4954 } 4955 if err := o.processReplicatedAck(dseq, sseq); err == errConsumerClosed { 4956 return err 4957 } 4958 case updateSkipOp: 4959 o.mu.Lock() 4960 if !o.isLeader() { 4961 var le = binary.LittleEndian 4962 if sseq := le.Uint64(buf[1:]); sseq > o.sseq { 4963 o.sseq = sseq 4964 } 4965 } 4966 o.mu.Unlock() 4967 case addPendingRequest: 4968 o.mu.Lock() 4969 if !o.isLeader() { 4970 if o.prm == nil { 4971 o.prm = make(map[string]struct{}) 4972 } 4973 o.prm[string(buf[1:])] = struct{}{} 4974 } 4975 o.mu.Unlock() 4976 case removePendingRequest: 4977 o.mu.Lock() 4978 if !o.isLeader() { 4979 if o.prm != nil { 4980 delete(o.prm, string(buf[1:])) 4981 } 4982 } 4983 o.mu.Unlock() 4984 default: 4985 panic(fmt.Sprintf("JetStream Cluster Unknown group entry op type: %v", entryOp(buf[0]))) 4986 } 4987 } 4988 } 4989 return nil 4990 } 4991 4992 var errConsumerClosed = errors.New("consumer closed") 4993 4994 func (o *consumer) processReplicatedAck(dseq, sseq uint64) error { 4995 o.mu.Lock() 4996 mset := o.mset 4997 if o.closed || mset == nil { 4998 o.mu.Unlock() 4999 return errConsumerClosed 5000 } 5001 if mset.closed.Load() { 5002 o.mu.Unlock() 5003 return errStreamClosed 5004 } 5005 5006 // Update activity. 5007 o.lat = time.Now() 5008 5009 // Do actual ack update to store. 5010 o.store.UpdateAcks(dseq, sseq) 5011 5012 if o.retention == LimitsPolicy { 5013 o.mu.Unlock() 5014 return nil 5015 } 5016 5017 var sagap uint64 5018 if o.cfg.AckPolicy == AckAll { 5019 if o.isLeader() { 5020 sagap = sseq - o.asflr 5021 } else { 5022 // We are a follower so only have the store state, so read that in. 5023 state, err := o.store.State() 5024 if err != nil { 5025 o.mu.Unlock() 5026 return err 5027 } 5028 sagap = sseq - state.AckFloor.Stream 5029 } 5030 } 5031 o.mu.Unlock() 5032 5033 if sagap > 1 { 5034 // FIXME(dlc) - This is very inefficient, will need to fix. 5035 for seq := sseq; seq > sseq-sagap; seq-- { 5036 mset.ackMsg(o, seq) 5037 } 5038 } else { 5039 mset.ackMsg(o, sseq) 5040 } 5041 return nil 5042 } 5043 5044 var errBadAckUpdate = errors.New("jetstream cluster bad replicated ack update") 5045 var errBadDeliveredUpdate = errors.New("jetstream cluster bad replicated delivered update") 5046 5047 func decodeAckUpdate(buf []byte) (dseq, sseq uint64, err error) { 5048 var bi, n int 5049 if dseq, n = binary.Uvarint(buf); n < 0 { 5050 return 0, 0, errBadAckUpdate 5051 } 5052 bi += n 5053 if sseq, n = binary.Uvarint(buf[bi:]); n < 0 { 5054 return 0, 0, errBadAckUpdate 5055 } 5056 return dseq, sseq, nil 5057 } 5058 5059 func decodeDeliveredUpdate(buf []byte) (dseq, sseq, dc uint64, ts int64, err error) { 5060 var bi, n int 5061 if dseq, n = binary.Uvarint(buf); n < 0 { 5062 return 0, 0, 0, 0, errBadDeliveredUpdate 5063 } 5064 bi += n 5065 if sseq, n = binary.Uvarint(buf[bi:]); n < 0 { 5066 return 0, 0, 0, 0, errBadDeliveredUpdate 5067 } 5068 bi += n 5069 if dc, n = binary.Uvarint(buf[bi:]); n < 0 { 5070 return 0, 0, 0, 0, errBadDeliveredUpdate 5071 } 5072 bi += n 5073 if ts, n = binary.Varint(buf[bi:]); n < 0 { 5074 return 0, 0, 0, 0, errBadDeliveredUpdate 5075 } 5076 return dseq, sseq, dc, ts, nil 5077 } 5078 5079 func (js *jetStream) processConsumerLeaderChange(o *consumer, isLeader bool) error { 5080 stepDownIfLeader := func() error { 5081 if node := o.raftNode(); node != nil && isLeader { 5082 node.StepDown() 5083 } 5084 return errors.New("failed to update consumer leader status") 5085 } 5086 5087 if o == nil || o.isClosed() { 5088 return stepDownIfLeader() 5089 } 5090 5091 ca := o.consumerAssignment() 5092 if ca == nil { 5093 return stepDownIfLeader() 5094 } 5095 js.mu.Lock() 5096 s, account, err := js.srv, ca.Client.serviceAccount(), ca.err 5097 client, subject, reply, streamName, consumerName := ca.Client, ca.Subject, ca.Reply, ca.Stream, ca.Name 5098 hasResponded := ca.responded 5099 ca.responded = true 5100 js.mu.Unlock() 5101 5102 acc, _ := s.LookupAccount(account) 5103 if acc == nil { 5104 return stepDownIfLeader() 5105 } 5106 5107 if isLeader { 5108 s.Noticef("JetStream cluster new consumer leader for '%s > %s > %s'", ca.Client.serviceAccount(), streamName, consumerName) 5109 s.sendConsumerLeaderElectAdvisory(o) 5110 // Check for peer removal and process here if needed. 5111 js.checkPeers(ca.Group) 5112 } else { 5113 // We are stepping down. 5114 // Make sure if we are doing so because we have lost quorum that we send the appropriate advisories. 5115 if node := o.raftNode(); node != nil && !node.Quorum() && time.Since(node.Created()) > 5*time.Second { 5116 s.sendConsumerLostQuorumAdvisory(o) 5117 } 5118 } 5119 5120 // Tell consumer to switch leader status. 5121 o.setLeader(isLeader) 5122 5123 if !isLeader || hasResponded { 5124 if isLeader { 5125 o.clearInitialInfo() 5126 } 5127 return nil 5128 } 5129 5130 var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} 5131 if err != nil { 5132 resp.Error = NewJSConsumerCreateError(err, Unless(err)) 5133 s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 5134 } else { 5135 resp.ConsumerInfo = o.initialInfo() 5136 s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) 5137 if node := o.raftNode(); node != nil { 5138 o.sendCreateAdvisory() 5139 } 5140 } 5141 5142 // Only send a pause advisory on consumer create if we're 5143 // actually paused. The timer would have been kicked by now 5144 // by the call to o.setLeader() above. 5145 if isLeader && o.cfg.PauseUntil != nil && !o.cfg.PauseUntil.IsZero() && time.Now().Before(*o.cfg.PauseUntil) { 5146 o.sendPauseAdvisoryLocked(&o.cfg) 5147 } 5148 5149 return nil 5150 } 5151 5152 // Determines if we should send lost quorum advisory. We throttle these after first one. 5153 func (o *consumer) shouldSendLostQuorum() bool { 5154 o.mu.Lock() 5155 defer o.mu.Unlock() 5156 if time.Since(o.lqsent) >= lostQuorumAdvInterval { 5157 o.lqsent = time.Now() 5158 return true 5159 } 5160 return false 5161 } 5162 5163 func (s *Server) sendConsumerLostQuorumAdvisory(o *consumer) { 5164 if o == nil { 5165 return 5166 } 5167 node, stream, consumer, acc := o.raftNode(), o.streamName(), o.String(), o.account() 5168 if node == nil { 5169 return 5170 } 5171 if !o.shouldSendLostQuorum() { 5172 return 5173 } 5174 5175 s.Warnf("JetStream cluster consumer '%s > %s > %s' has NO quorum, stalled.", acc.GetName(), stream, consumer) 5176 5177 subj := JSAdvisoryConsumerQuorumLostPre + "." + stream + "." + consumer 5178 adv := &JSConsumerQuorumLostAdvisory{ 5179 TypedEvent: TypedEvent{ 5180 Type: JSConsumerQuorumLostAdvisoryType, 5181 ID: nuid.Next(), 5182 Time: time.Now().UTC(), 5183 }, 5184 Stream: stream, 5185 Consumer: consumer, 5186 Replicas: s.replicas(node), 5187 Domain: s.getOpts().JetStreamDomain, 5188 } 5189 5190 // Send to the user's account if not the system account. 5191 if acc != s.SystemAccount() { 5192 s.publishAdvisory(acc, subj, adv) 5193 } 5194 // Now do system level one. Place account info in adv, and nil account means system. 5195 adv.Account = acc.GetName() 5196 s.publishAdvisory(nil, subj, adv) 5197 } 5198 5199 func (s *Server) sendConsumerLeaderElectAdvisory(o *consumer) { 5200 if o == nil { 5201 return 5202 } 5203 node, stream, consumer, acc := o.raftNode(), o.streamName(), o.String(), o.account() 5204 if node == nil { 5205 return 5206 } 5207 5208 subj := JSAdvisoryConsumerLeaderElectedPre + "." + stream + "." + consumer 5209 adv := &JSConsumerLeaderElectedAdvisory{ 5210 TypedEvent: TypedEvent{ 5211 Type: JSConsumerLeaderElectedAdvisoryType, 5212 ID: nuid.Next(), 5213 Time: time.Now().UTC(), 5214 }, 5215 Stream: stream, 5216 Consumer: consumer, 5217 Leader: s.serverNameForNode(node.GroupLeader()), 5218 Replicas: s.replicas(node), 5219 Domain: s.getOpts().JetStreamDomain, 5220 } 5221 5222 // Send to the user's account if not the system account. 5223 if acc != s.SystemAccount() { 5224 s.publishAdvisory(acc, subj, adv) 5225 } 5226 // Now do system level one. Place account info in adv, and nil account means system. 5227 adv.Account = acc.GetName() 5228 s.publishAdvisory(nil, subj, adv) 5229 } 5230 5231 type streamAssignmentResult struct { 5232 Account string `json:"account"` 5233 Stream string `json:"stream"` 5234 Response *JSApiStreamCreateResponse `json:"create_response,omitempty"` 5235 Restore *JSApiStreamRestoreResponse `json:"restore_response,omitempty"` 5236 Update bool `json:"is_update,omitempty"` 5237 } 5238 5239 // Determine if this is an insufficient resources' error type. 5240 func isInsufficientResourcesErr(resp *JSApiStreamCreateResponse) bool { 5241 return resp != nil && resp.Error != nil && IsNatsErr(resp.Error, JSInsufficientResourcesErr, JSMemoryResourcesExceededErr, JSStorageResourcesExceededErr) 5242 } 5243 5244 // Process error results of stream and consumer assignments. 5245 // Success will be handled by stream leader. 5246 func (js *jetStream) processStreamAssignmentResults(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 5247 var result streamAssignmentResult 5248 if err := json.Unmarshal(msg, &result); err != nil { 5249 // TODO(dlc) - log 5250 return 5251 } 5252 acc, _ := js.srv.LookupAccount(result.Account) 5253 if acc == nil { 5254 // TODO(dlc) - log 5255 return 5256 } 5257 5258 js.mu.Lock() 5259 defer js.mu.Unlock() 5260 5261 s, cc := js.srv, js.cluster 5262 if cc == nil || cc.meta == nil { 5263 return 5264 } 5265 5266 // This should have been done already in processStreamAssignment, but in 5267 // case we have a code path that gets here with no processStreamAssignment, 5268 // then we will do the proper thing. Otherwise will be a no-op. 5269 cc.removeInflightProposal(result.Account, result.Stream) 5270 5271 // FIXME(dlc) - suppress duplicates? 5272 if sa := js.streamAssignment(result.Account, result.Stream); sa != nil { 5273 canDelete := !result.Update && time.Since(sa.Created) < 5*time.Second 5274 5275 // See if we should retry in case this cluster is full but there are others. 5276 if cfg, ci := sa.Config, sa.Client; cfg != nil && ci != nil && isInsufficientResourcesErr(result.Response) && canDelete { 5277 // If cluster is defined we can not retry. 5278 if cfg.Placement == nil || cfg.Placement.Cluster == _EMPTY_ { 5279 // If we have additional clusters to try we can retry. 5280 // We have already verified that ci != nil. 5281 if len(ci.Alternates) > 0 { 5282 if rg, err := js.createGroupForStream(ci, cfg); err != nil { 5283 s.Warnf("Retrying cluster placement for stream '%s > %s' failed due to placement error: %+v", result.Account, result.Stream, err) 5284 } else { 5285 if org := sa.Group; org != nil && len(org.Peers) > 0 { 5286 s.Warnf("Retrying cluster placement for stream '%s > %s' due to insufficient resources in cluster %q", 5287 result.Account, result.Stream, s.clusterNameForNode(org.Peers[0])) 5288 } else { 5289 s.Warnf("Retrying cluster placement for stream '%s > %s' due to insufficient resources", result.Account, result.Stream) 5290 } 5291 // Pick a new preferred leader. 5292 rg.setPreferred() 5293 // Get rid of previous attempt. 5294 cc.meta.Propose(encodeDeleteStreamAssignment(sa)) 5295 // Propose new. 5296 sa.Group, sa.err = rg, nil 5297 cc.meta.Propose(encodeAddStreamAssignment(sa)) 5298 return 5299 } 5300 } 5301 } 5302 } 5303 5304 // Respond to the user here. 5305 var resp string 5306 if result.Response != nil { 5307 resp = s.jsonResponse(result.Response) 5308 } else if result.Restore != nil { 5309 resp = s.jsonResponse(result.Restore) 5310 } 5311 if !sa.responded || result.Update { 5312 sa.responded = true 5313 js.srv.sendAPIErrResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, resp) 5314 } 5315 // Remove this assignment if possible. 5316 if canDelete { 5317 sa.err = NewJSClusterNotAssignedError() 5318 cc.meta.Propose(encodeDeleteStreamAssignment(sa)) 5319 } 5320 } 5321 } 5322 5323 func (js *jetStream) processConsumerAssignmentResults(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 5324 var result consumerAssignmentResult 5325 if err := json.Unmarshal(msg, &result); err != nil { 5326 // TODO(dlc) - log 5327 return 5328 } 5329 acc, _ := js.srv.LookupAccount(result.Account) 5330 if acc == nil { 5331 // TODO(dlc) - log 5332 return 5333 } 5334 5335 js.mu.Lock() 5336 defer js.mu.Unlock() 5337 5338 s, cc := js.srv, js.cluster 5339 if cc == nil || cc.meta == nil { 5340 return 5341 } 5342 5343 if sa := js.streamAssignment(result.Account, result.Stream); sa != nil && sa.consumers != nil { 5344 if ca := sa.consumers[result.Consumer]; ca != nil && !ca.responded { 5345 js.srv.sendAPIErrResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(result.Response)) 5346 ca.responded = true 5347 5348 // Check if this failed. 5349 // TODO(dlc) - Could have mixed results, should track per peer. 5350 // Make sure this is recent response, do not delete existing consumers. 5351 if result.Response.Error != nil && result.Response.Error != NewJSConsumerNameExistError() && time.Since(ca.Created) < 2*time.Second { 5352 // So while we are deleting we will not respond to list/names requests. 5353 ca.err = NewJSClusterNotAssignedError() 5354 cc.meta.Propose(encodeDeleteConsumerAssignment(ca)) 5355 s.Warnf("Proposing to delete consumer `%s > %s > %s' due to assignment response error: %v", 5356 result.Account, result.Stream, result.Consumer, result.Response.Error) 5357 } 5358 } 5359 } 5360 } 5361 5362 const ( 5363 streamAssignmentSubj = "$SYS.JSC.STREAM.ASSIGNMENT.RESULT" 5364 consumerAssignmentSubj = "$SYS.JSC.CONSUMER.ASSIGNMENT.RESULT" 5365 ) 5366 5367 // Lock should be held. 5368 func (js *jetStream) startUpdatesSub() { 5369 cc, s, c := js.cluster, js.srv, js.cluster.c 5370 if cc.streamResults == nil { 5371 cc.streamResults, _ = s.systemSubscribe(streamAssignmentSubj, _EMPTY_, false, c, js.processStreamAssignmentResults) 5372 } 5373 if cc.consumerResults == nil { 5374 cc.consumerResults, _ = s.systemSubscribe(consumerAssignmentSubj, _EMPTY_, false, c, js.processConsumerAssignmentResults) 5375 } 5376 if cc.stepdown == nil { 5377 cc.stepdown, _ = s.systemSubscribe(JSApiLeaderStepDown, _EMPTY_, false, c, s.jsLeaderStepDownRequest) 5378 } 5379 if cc.peerRemove == nil { 5380 cc.peerRemove, _ = s.systemSubscribe(JSApiRemoveServer, _EMPTY_, false, c, s.jsLeaderServerRemoveRequest) 5381 } 5382 if cc.peerStreamMove == nil { 5383 cc.peerStreamMove, _ = s.systemSubscribe(JSApiServerStreamMove, _EMPTY_, false, c, s.jsLeaderServerStreamMoveRequest) 5384 } 5385 if cc.peerStreamCancelMove == nil { 5386 cc.peerStreamCancelMove, _ = s.systemSubscribe(JSApiServerStreamCancelMove, _EMPTY_, false, c, s.jsLeaderServerStreamCancelMoveRequest) 5387 } 5388 if js.accountPurge == nil { 5389 js.accountPurge, _ = s.systemSubscribe(JSApiAccountPurge, _EMPTY_, false, c, s.jsLeaderAccountPurgeRequest) 5390 } 5391 } 5392 5393 // Lock should be held. 5394 func (js *jetStream) stopUpdatesSub() { 5395 cc := js.cluster 5396 if cc.streamResults != nil { 5397 cc.s.sysUnsubscribe(cc.streamResults) 5398 cc.streamResults = nil 5399 } 5400 if cc.consumerResults != nil { 5401 cc.s.sysUnsubscribe(cc.consumerResults) 5402 cc.consumerResults = nil 5403 } 5404 if cc.stepdown != nil { 5405 cc.s.sysUnsubscribe(cc.stepdown) 5406 cc.stepdown = nil 5407 } 5408 if cc.peerRemove != nil { 5409 cc.s.sysUnsubscribe(cc.peerRemove) 5410 cc.peerRemove = nil 5411 } 5412 if cc.peerStreamMove != nil { 5413 cc.s.sysUnsubscribe(cc.peerStreamMove) 5414 cc.peerStreamMove = nil 5415 } 5416 if cc.peerStreamCancelMove != nil { 5417 cc.s.sysUnsubscribe(cc.peerStreamCancelMove) 5418 cc.peerStreamCancelMove = nil 5419 } 5420 if js.accountPurge != nil { 5421 cc.s.sysUnsubscribe(js.accountPurge) 5422 js.accountPurge = nil 5423 } 5424 } 5425 5426 func (s *Server) sendDomainLeaderElectAdvisory() { 5427 js, cc := s.getJetStreamCluster() 5428 if js == nil || cc == nil { 5429 return 5430 } 5431 5432 js.mu.RLock() 5433 node := cc.meta 5434 js.mu.RUnlock() 5435 5436 adv := &JSDomainLeaderElectedAdvisory{ 5437 TypedEvent: TypedEvent{ 5438 Type: JSDomainLeaderElectedAdvisoryType, 5439 ID: nuid.Next(), 5440 Time: time.Now().UTC(), 5441 }, 5442 Leader: node.GroupLeader(), 5443 Replicas: s.replicas(node), 5444 Cluster: s.cachedClusterName(), 5445 Domain: s.getOpts().JetStreamDomain, 5446 } 5447 5448 s.publishAdvisory(nil, JSAdvisoryDomainLeaderElected, adv) 5449 } 5450 5451 func (js *jetStream) processLeaderChange(isLeader bool) { 5452 if js == nil { 5453 return 5454 } 5455 s := js.srv 5456 if s == nil { 5457 return 5458 } 5459 // Update our server atomic. 5460 s.isMetaLeader.Store(isLeader) 5461 5462 if isLeader { 5463 s.Noticef("Self is new JetStream cluster metadata leader") 5464 s.sendDomainLeaderElectAdvisory() 5465 } else { 5466 var node string 5467 if meta := js.getMetaGroup(); meta != nil { 5468 node = meta.GroupLeader() 5469 } 5470 if node == _EMPTY_ { 5471 s.Noticef("JetStream cluster no metadata leader") 5472 } else if srv := js.srv.serverNameForNode(node); srv == _EMPTY_ { 5473 s.Noticef("JetStream cluster new remote metadata leader") 5474 } else if clst := js.srv.clusterNameForNode(node); clst == _EMPTY_ { 5475 s.Noticef("JetStream cluster new metadata leader: %s", srv) 5476 } else { 5477 s.Noticef("JetStream cluster new metadata leader: %s/%s", srv, clst) 5478 } 5479 } 5480 5481 js.mu.Lock() 5482 defer js.mu.Unlock() 5483 5484 if isLeader { 5485 js.startUpdatesSub() 5486 } else { 5487 js.stopUpdatesSub() 5488 // TODO(dlc) - stepdown. 5489 } 5490 5491 // If we have been signaled to check the streams, this is for a bug that left stream 5492 // assignments with no sync subject after an update and no way to sync/catchup outside of the RAFT layer. 5493 if isLeader && js.cluster.streamsCheck { 5494 cc := js.cluster 5495 for acc, asa := range cc.streams { 5496 for _, sa := range asa { 5497 if sa.Sync == _EMPTY_ { 5498 s.Warnf("Stream assigment corrupt for stream '%s > %s'", acc, sa.Config.Name) 5499 nsa := &streamAssignment{Group: sa.Group, Config: sa.Config, Subject: sa.Subject, Reply: sa.Reply, Client: sa.Client} 5500 nsa.Sync = syncSubjForStream() 5501 cc.meta.Propose(encodeUpdateStreamAssignment(nsa)) 5502 } 5503 } 5504 } 5505 // Clear check. 5506 cc.streamsCheck = false 5507 } 5508 } 5509 5510 // Lock should be held. 5511 func (cc *jetStreamCluster) remapStreamAssignment(sa *streamAssignment, removePeer string) bool { 5512 // Invoke placement algo passing RG peers that stay (existing) and the peer that is being removed (ignore) 5513 var retain, ignore []string 5514 for _, v := range sa.Group.Peers { 5515 if v == removePeer { 5516 ignore = append(ignore, v) 5517 } else { 5518 retain = append(retain, v) 5519 } 5520 } 5521 5522 newPeers, placementError := cc.selectPeerGroup(len(sa.Group.Peers), sa.Group.Cluster, sa.Config, retain, 0, ignore) 5523 5524 if placementError == nil { 5525 sa.Group.Peers = newPeers 5526 // Don't influence preferred leader. 5527 sa.Group.Preferred = _EMPTY_ 5528 return true 5529 } 5530 5531 // If R1 just return to avoid bricking the stream. 5532 if sa.Group.node == nil || len(sa.Group.Peers) == 1 { 5533 return false 5534 } 5535 5536 // If we are here let's remove the peer at least, as long as we are R>1 5537 for i, peer := range sa.Group.Peers { 5538 if peer == removePeer { 5539 sa.Group.Peers[i] = sa.Group.Peers[len(sa.Group.Peers)-1] 5540 sa.Group.Peers = sa.Group.Peers[:len(sa.Group.Peers)-1] 5541 break 5542 } 5543 } 5544 return false 5545 } 5546 5547 type selectPeerError struct { 5548 excludeTag bool 5549 offline bool 5550 noStorage bool 5551 uniqueTag bool 5552 misc bool 5553 noJsClust bool 5554 noMatchTags map[string]struct{} 5555 } 5556 5557 func (e *selectPeerError) Error() string { 5558 b := strings.Builder{} 5559 writeBoolErrReason := func(hasErr bool, errMsg string) { 5560 if !hasErr { 5561 return 5562 } 5563 b.WriteString(", ") 5564 b.WriteString(errMsg) 5565 } 5566 b.WriteString("no suitable peers for placement") 5567 writeBoolErrReason(e.offline, "peer offline") 5568 writeBoolErrReason(e.excludeTag, "exclude tag set") 5569 writeBoolErrReason(e.noStorage, "insufficient storage") 5570 writeBoolErrReason(e.uniqueTag, "server tag not unique") 5571 writeBoolErrReason(e.misc, "miscellaneous issue") 5572 writeBoolErrReason(e.noJsClust, "jetstream not enabled in cluster") 5573 if len(e.noMatchTags) != 0 { 5574 b.WriteString(", tags not matched [") 5575 var firstTagWritten bool 5576 for tag := range e.noMatchTags { 5577 if firstTagWritten { 5578 b.WriteString(", ") 5579 } 5580 firstTagWritten = true 5581 b.WriteRune('\'') 5582 b.WriteString(tag) 5583 b.WriteRune('\'') 5584 } 5585 b.WriteString("]") 5586 } 5587 return b.String() 5588 } 5589 5590 func (e *selectPeerError) addMissingTag(t string) { 5591 if e.noMatchTags == nil { 5592 e.noMatchTags = map[string]struct{}{} 5593 } 5594 e.noMatchTags[t] = struct{}{} 5595 } 5596 5597 func (e *selectPeerError) accumulate(eAdd *selectPeerError) { 5598 if eAdd == nil { 5599 return 5600 } 5601 acc := func(val *bool, valAdd bool) { 5602 if valAdd { 5603 *val = valAdd 5604 } 5605 } 5606 acc(&e.offline, eAdd.offline) 5607 acc(&e.excludeTag, eAdd.excludeTag) 5608 acc(&e.noStorage, eAdd.noStorage) 5609 acc(&e.uniqueTag, eAdd.uniqueTag) 5610 acc(&e.misc, eAdd.misc) 5611 acc(&e.noJsClust, eAdd.noJsClust) 5612 for tag := range eAdd.noMatchTags { 5613 e.addMissingTag(tag) 5614 } 5615 } 5616 5617 // selectPeerGroup will select a group of peers to start a raft group. 5618 // when peers exist already the unique tag prefix check for the replaceFirstExisting will be skipped 5619 // js lock should be held. 5620 func (cc *jetStreamCluster) selectPeerGroup(r int, cluster string, cfg *StreamConfig, existing []string, replaceFirstExisting int, ignore []string) ([]string, *selectPeerError) { 5621 if cluster == _EMPTY_ || cfg == nil { 5622 return nil, &selectPeerError{misc: true} 5623 } 5624 5625 var maxBytes uint64 5626 if cfg.MaxBytes > 0 { 5627 maxBytes = uint64(cfg.MaxBytes) 5628 } 5629 5630 // Check for tags. 5631 var tags []string 5632 if cfg.Placement != nil && len(cfg.Placement.Tags) > 0 { 5633 tags = cfg.Placement.Tags 5634 } 5635 5636 // Used for weighted sorting based on availability. 5637 type wn struct { 5638 id string 5639 avail uint64 5640 ha int 5641 ns int 5642 } 5643 5644 var nodes []wn 5645 // peers is a randomized list 5646 s, peers := cc.s, cc.meta.Peers() 5647 5648 uniqueTagPrefix := s.getOpts().JetStreamUniqueTag 5649 if uniqueTagPrefix != _EMPTY_ { 5650 for _, tag := range tags { 5651 if strings.HasPrefix(tag, uniqueTagPrefix) { 5652 // disable uniqueness check if explicitly listed in tags 5653 uniqueTagPrefix = _EMPTY_ 5654 break 5655 } 5656 } 5657 } 5658 var uniqueTags = make(map[string]*nodeInfo) 5659 5660 checkUniqueTag := func(ni *nodeInfo) (bool, *nodeInfo) { 5661 for _, t := range ni.tags { 5662 if strings.HasPrefix(t, uniqueTagPrefix) { 5663 if n, ok := uniqueTags[t]; !ok { 5664 uniqueTags[t] = ni 5665 return true, ni 5666 } else { 5667 return false, n 5668 } 5669 } 5670 } 5671 // default requires the unique prefix to be present 5672 return false, nil 5673 } 5674 5675 // Map existing. 5676 var ep map[string]struct{} 5677 if le := len(existing); le > 0 { 5678 if le >= r { 5679 return existing[:r], nil 5680 } 5681 ep = make(map[string]struct{}) 5682 for i, p := range existing { 5683 ep[p] = struct{}{} 5684 if uniqueTagPrefix == _EMPTY_ { 5685 continue 5686 } 5687 si, ok := s.nodeToInfo.Load(p) 5688 if !ok || si == nil || i < replaceFirstExisting { 5689 continue 5690 } 5691 ni := si.(nodeInfo) 5692 // collect unique tags, but do not require them as this node is already part of the peerset 5693 checkUniqueTag(&ni) 5694 } 5695 } 5696 5697 // Map ignore 5698 var ip map[string]struct{} 5699 if li := len(ignore); li > 0 { 5700 ip = make(map[string]struct{}) 5701 for _, p := range ignore { 5702 ip[p] = struct{}{} 5703 } 5704 } 5705 5706 // Grab the number of streams and HA assets currently assigned to each peer. 5707 // HAAssets under usage is async, so calculate here in realtime based on assignments. 5708 peerStreams := make(map[string]int, len(peers)) 5709 peerHA := make(map[string]int, len(peers)) 5710 for _, asa := range cc.streams { 5711 for _, sa := range asa { 5712 isHA := len(sa.Group.Peers) > 1 5713 for _, peer := range sa.Group.Peers { 5714 peerStreams[peer]++ 5715 if isHA { 5716 peerHA[peer]++ 5717 } 5718 } 5719 } 5720 } 5721 5722 maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets 5723 5724 // An error is a result of multiple individual placement decisions. 5725 // Which is why we keep taps on how often which one happened. 5726 err := selectPeerError{} 5727 5728 // Shuffle them up. 5729 rand.Shuffle(len(peers), func(i, j int) { peers[i], peers[j] = peers[j], peers[i] }) 5730 for _, p := range peers { 5731 si, ok := s.nodeToInfo.Load(p.ID) 5732 if !ok || si == nil { 5733 err.misc = true 5734 continue 5735 } 5736 ni := si.(nodeInfo) 5737 // Only select from the designated named cluster. 5738 if ni.cluster != cluster { 5739 s.Debugf("Peer selection: discard %s@%s reason: not target cluster %s", ni.name, ni.cluster, cluster) 5740 continue 5741 } 5742 5743 // If we know its offline or we do not have config or err don't consider. 5744 if ni.offline || ni.cfg == nil || ni.stats == nil { 5745 s.Debugf("Peer selection: discard %s@%s reason: offline", ni.name, ni.cluster) 5746 err.offline = true 5747 continue 5748 } 5749 5750 // If ignore skip 5751 if _, ok := ip[p.ID]; ok { 5752 continue 5753 } 5754 5755 // If existing also skip, we will add back in to front of the list when done. 5756 if _, ok := ep[p.ID]; ok { 5757 continue 5758 } 5759 5760 if ni.tags.Contains(jsExcludePlacement) { 5761 s.Debugf("Peer selection: discard %s@%s tags: %v reason: %s present", 5762 ni.name, ni.cluster, ni.tags, jsExcludePlacement) 5763 err.excludeTag = true 5764 continue 5765 } 5766 5767 if len(tags) > 0 { 5768 matched := true 5769 for _, t := range tags { 5770 if !ni.tags.Contains(t) { 5771 matched = false 5772 s.Debugf("Peer selection: discard %s@%s tags: %v reason: mandatory tag %s not present", 5773 ni.name, ni.cluster, ni.tags, t) 5774 err.addMissingTag(t) 5775 break 5776 } 5777 } 5778 if !matched { 5779 continue 5780 } 5781 } 5782 5783 var available uint64 5784 if ni.stats != nil { 5785 switch cfg.Storage { 5786 case MemoryStorage: 5787 used := ni.stats.ReservedMemory 5788 if ni.stats.Memory > used { 5789 used = ni.stats.Memory 5790 } 5791 if ni.cfg.MaxMemory > int64(used) { 5792 available = uint64(ni.cfg.MaxMemory) - used 5793 } 5794 case FileStorage: 5795 used := ni.stats.ReservedStore 5796 if ni.stats.Store > used { 5797 used = ni.stats.Store 5798 } 5799 if ni.cfg.MaxStore > int64(used) { 5800 available = uint64(ni.cfg.MaxStore) - used 5801 } 5802 } 5803 } 5804 5805 // Otherwise check if we have enough room if maxBytes set. 5806 if maxBytes > 0 && maxBytes > available { 5807 s.Warnf("Peer selection: discard %s@%s (Max Bytes: %d) exceeds available %s storage of %d bytes", 5808 ni.name, ni.cluster, maxBytes, cfg.Storage.String(), available) 5809 err.noStorage = true 5810 continue 5811 } 5812 // HAAssets contain _meta_ which we want to ignore, hence > and not >=. 5813 if maxHaAssets > 0 && ni.stats != nil && ni.stats.HAAssets > maxHaAssets { 5814 s.Warnf("Peer selection: discard %s@%s (HA Asset Count: %d) exceeds max ha asset limit of %d for stream placement", 5815 ni.name, ni.cluster, ni.stats.HAAssets, maxHaAssets) 5816 err.misc = true 5817 continue 5818 } 5819 5820 if uniqueTagPrefix != _EMPTY_ { 5821 if unique, owner := checkUniqueTag(&ni); !unique { 5822 if owner != nil { 5823 s.Debugf("Peer selection: discard %s@%s tags:%v reason: unique prefix %s owned by %s@%s", 5824 ni.name, ni.cluster, ni.tags, owner.name, owner.cluster) 5825 } else { 5826 s.Debugf("Peer selection: discard %s@%s tags:%v reason: unique prefix %s not present", 5827 ni.name, ni.cluster, ni.tags) 5828 } 5829 err.uniqueTag = true 5830 continue 5831 } 5832 } 5833 // Add to our list of potential nodes. 5834 nodes = append(nodes, wn{p.ID, available, peerHA[p.ID], peerStreams[p.ID]}) 5835 } 5836 5837 // If we could not select enough peers, fail. 5838 if len(nodes) < (r - len(existing)) { 5839 s.Debugf("Peer selection: required %d nodes but found %d (cluster: %s replica: %d existing: %v/%d peers: %d result-peers: %d err: %+v)", 5840 (r - len(existing)), len(nodes), cluster, r, existing, replaceFirstExisting, len(peers), len(nodes), err) 5841 if len(peers) == 0 { 5842 err.noJsClust = true 5843 } 5844 return nil, &err 5845 } 5846 // Sort based on available from most to least, breaking ties by number of total streams assigned to the peer. 5847 sort.Slice(nodes, func(i, j int) bool { 5848 if nodes[i].avail == nodes[j].avail { 5849 return nodes[i].ns < nodes[j].ns 5850 } 5851 return nodes[i].avail > nodes[j].avail 5852 }) 5853 // If we are placing a replicated stream, let's sort based on HAAssets, as that is more important to balance. 5854 if cfg.Replicas > 1 { 5855 sort.SliceStable(nodes, func(i, j int) bool { return nodes[i].ha < nodes[j].ha }) 5856 } 5857 5858 var results []string 5859 if len(existing) > 0 { 5860 results = append(results, existing...) 5861 r -= len(existing) 5862 } 5863 for _, r := range nodes[:r] { 5864 results = append(results, r.id) 5865 } 5866 return results, nil 5867 } 5868 5869 func groupNameForStream(peers []string, storage StorageType) string { 5870 return groupName("S", peers, storage) 5871 } 5872 5873 func groupNameForConsumer(peers []string, storage StorageType) string { 5874 return groupName("C", peers, storage) 5875 } 5876 5877 func groupName(prefix string, peers []string, storage StorageType) string { 5878 gns := getHash(nuid.Next()) 5879 return fmt.Sprintf("%s-R%d%s-%s", prefix, len(peers), storage.String()[:1], gns) 5880 } 5881 5882 // returns stream count for this tier as well as applicable reservation size (not including reservations for cfg) 5883 // jetStream read lock should be held 5884 func tieredStreamAndReservationCount(asa map[string]*streamAssignment, tier string, cfg *StreamConfig) (int, int64) { 5885 var numStreams int 5886 var reservation int64 5887 for _, sa := range asa { 5888 if tier == _EMPTY_ || isSameTier(sa.Config, cfg) { 5889 numStreams++ 5890 if sa.Config.MaxBytes > 0 && sa.Config.Storage == cfg.Storage && sa.Config.Name != cfg.Name { 5891 // If tier is empty, all storage is flat and we should adjust for replicas. 5892 // Otherwise if tiered, storage replication already taken into consideration. 5893 if tier == _EMPTY_ && cfg.Replicas > 1 { 5894 reservation += sa.Config.MaxBytes * int64(cfg.Replicas) 5895 } else { 5896 reservation += sa.Config.MaxBytes 5897 } 5898 } 5899 } 5900 } 5901 return numStreams, reservation 5902 } 5903 5904 // createGroupForStream will create a group for assignment for the stream. 5905 // Lock should be held. 5906 func (js *jetStream) createGroupForStream(ci *ClientInfo, cfg *StreamConfig) (*raftGroup, *selectPeerError) { 5907 replicas := cfg.Replicas 5908 if replicas == 0 { 5909 replicas = 1 5910 } 5911 5912 // Default connected cluster from the request origin. 5913 cc, cluster := js.cluster, ci.Cluster 5914 // If specified, override the default. 5915 clusterDefined := cfg.Placement != nil && cfg.Placement.Cluster != _EMPTY_ 5916 if clusterDefined { 5917 cluster = cfg.Placement.Cluster 5918 } 5919 clusters := []string{cluster} 5920 if !clusterDefined { 5921 clusters = append(clusters, ci.Alternates...) 5922 } 5923 5924 // Need to create a group here. 5925 errs := &selectPeerError{} 5926 for _, cn := range clusters { 5927 peers, err := cc.selectPeerGroup(replicas, cn, cfg, nil, 0, nil) 5928 if len(peers) < replicas { 5929 errs.accumulate(err) 5930 continue 5931 } 5932 return &raftGroup{Name: groupNameForStream(peers, cfg.Storage), Storage: cfg.Storage, Peers: peers, Cluster: cn}, nil 5933 } 5934 return nil, errs 5935 } 5936 5937 func (acc *Account) selectLimits(cfg *StreamConfig) (*JetStreamAccountLimits, string, *jsAccount, *ApiError) { 5938 // Grab our jetstream account info. 5939 acc.mu.RLock() 5940 jsa := acc.js 5941 acc.mu.RUnlock() 5942 5943 if jsa == nil { 5944 return nil, _EMPTY_, nil, NewJSNotEnabledForAccountError() 5945 } 5946 5947 jsa.usageMu.RLock() 5948 selectedLimits, tierName, ok := jsa.selectLimits(cfg) 5949 jsa.usageMu.RUnlock() 5950 5951 if !ok { 5952 return nil, _EMPTY_, nil, NewJSNoLimitsError() 5953 } 5954 return &selectedLimits, tierName, jsa, nil 5955 } 5956 5957 // Read lock needs to be held 5958 func (js *jetStream) jsClusteredStreamLimitsCheck(acc *Account, cfg *StreamConfig) *ApiError { 5959 selectedLimits, tier, _, apiErr := acc.selectLimits(cfg) 5960 if apiErr != nil { 5961 return apiErr 5962 } 5963 5964 asa := js.cluster.streams[acc.Name] 5965 numStreams, reservations := tieredStreamAndReservationCount(asa, tier, cfg) 5966 // Check for inflight proposals... 5967 if cc := js.cluster; cc != nil && cc.inflight != nil { 5968 numStreams += len(cc.inflight[acc.Name]) 5969 } 5970 if selectedLimits.MaxStreams > 0 && numStreams >= selectedLimits.MaxStreams { 5971 return NewJSMaximumStreamsLimitError() 5972 } 5973 // Check for account limits here before proposing. 5974 if err := js.checkAccountLimits(selectedLimits, cfg, reservations); err != nil { 5975 return NewJSStreamLimitsError(err, Unless(err)) 5976 } 5977 return nil 5978 } 5979 5980 func (s *Server) jsClusteredStreamRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, config *StreamConfig) { 5981 js, cc := s.getJetStreamCluster() 5982 if js == nil || cc == nil { 5983 return 5984 } 5985 5986 var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}} 5987 5988 ccfg, apiErr := s.checkStreamCfg(config, acc) 5989 if apiErr != nil { 5990 resp.Error = apiErr 5991 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 5992 return 5993 } 5994 cfg := &ccfg 5995 5996 // Now process the request and proposal. 5997 js.mu.Lock() 5998 defer js.mu.Unlock() 5999 6000 var self *streamAssignment 6001 var rg *raftGroup 6002 6003 // Capture if we have existing assignment first. 6004 if osa := js.streamAssignment(acc.Name, cfg.Name); osa != nil { 6005 if !reflect.DeepEqual(osa.Config, cfg) { 6006 resp.Error = NewJSStreamNameExistError() 6007 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6008 return 6009 } 6010 // This is an equal assignment. 6011 self, rg = osa, osa.Group 6012 } 6013 6014 if cfg.Sealed { 6015 resp.Error = NewJSStreamInvalidConfigError(fmt.Errorf("stream configuration for create can not be sealed")) 6016 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6017 return 6018 } 6019 6020 // Check for subject collisions here. 6021 if cc.subjectsOverlap(acc.Name, cfg.Subjects, self) { 6022 resp.Error = NewJSStreamSubjectOverlapError() 6023 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6024 return 6025 } 6026 6027 apiErr = js.jsClusteredStreamLimitsCheck(acc, cfg) 6028 // Check for stream limits here before proposing. These need to be tracked from meta layer, not jsa. 6029 if apiErr != nil { 6030 resp.Error = apiErr 6031 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6032 return 6033 } 6034 6035 // Raft group selection and placement. 6036 if rg == nil { 6037 // Check inflight before proposing in case we have an existing inflight proposal. 6038 if cc.inflight == nil { 6039 cc.inflight = make(map[string]map[string]*raftGroup) 6040 } 6041 streams, ok := cc.inflight[acc.Name] 6042 if !ok { 6043 streams = make(map[string]*raftGroup) 6044 cc.inflight[acc.Name] = streams 6045 } else if existing, ok := streams[cfg.Name]; ok { 6046 // We have existing for same stream. Re-use same group. 6047 rg = existing 6048 } 6049 } 6050 // Create a new one here if needed. 6051 if rg == nil { 6052 nrg, err := js.createGroupForStream(ci, cfg) 6053 if err != nil { 6054 resp.Error = NewJSClusterNoPeersError(err) 6055 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6056 return 6057 } 6058 rg = nrg 6059 // Pick a preferred leader. 6060 rg.setPreferred() 6061 } 6062 6063 // Sync subject for post snapshot sync. 6064 sa := &streamAssignment{Group: rg, Sync: syncSubjForStream(), Config: cfg, Subject: subject, Reply: reply, Client: ci, Created: time.Now().UTC()} 6065 if err := cc.meta.Propose(encodeAddStreamAssignment(sa)); err == nil { 6066 // On success, add this as an inflight proposal so we can apply limits 6067 // on concurrent create requests while this stream assignment has 6068 // possibly not been processed yet. 6069 if streams, ok := cc.inflight[acc.Name]; ok { 6070 streams[cfg.Name] = rg 6071 } 6072 } 6073 } 6074 6075 var ( 6076 errReqTimeout = errors.New("timeout while waiting for response") 6077 errReqSrvExit = errors.New("server shutdown while waiting for response") 6078 ) 6079 6080 // blocking utility call to perform requests on the system account 6081 // returns (synchronized) v or error 6082 func sysRequest[T any](s *Server, subjFormat string, args ...any) (*T, error) { 6083 isubj := fmt.Sprintf(subjFormat, args...) 6084 6085 s.mu.Lock() 6086 inbox := s.newRespInbox() 6087 results := make(chan *T, 1) 6088 s.sys.replies[inbox] = func(_ *subscription, _ *client, _ *Account, _, _ string, msg []byte) { 6089 var v T 6090 if err := json.Unmarshal(msg, &v); err != nil { 6091 s.Warnf("Error unmarshalling response for request '%s':%v", isubj, err) 6092 return 6093 } 6094 select { 6095 case results <- &v: 6096 default: 6097 s.Warnf("Failed placing request response on internal channel") 6098 } 6099 } 6100 s.mu.Unlock() 6101 6102 s.sendInternalMsgLocked(isubj, inbox, nil, nil) 6103 6104 defer func() { 6105 s.mu.Lock() 6106 defer s.mu.Unlock() 6107 if s.sys != nil && s.sys.replies != nil { 6108 delete(s.sys.replies, inbox) 6109 } 6110 }() 6111 6112 ttl := time.NewTimer(2 * time.Second) 6113 defer ttl.Stop() 6114 6115 select { 6116 case <-s.quitCh: 6117 return nil, errReqSrvExit 6118 case <-ttl.C: 6119 return nil, errReqTimeout 6120 case data := <-results: 6121 return data, nil 6122 } 6123 } 6124 6125 func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, cfg *StreamConfig, peerSet []string) { 6126 js, cc := s.getJetStreamCluster() 6127 if js == nil || cc == nil { 6128 return 6129 } 6130 6131 // Now process the request and proposal. 6132 js.mu.Lock() 6133 defer js.mu.Unlock() 6134 meta := cc.meta 6135 if meta == nil { 6136 return 6137 } 6138 6139 var resp = JSApiStreamUpdateResponse{ApiResponse: ApiResponse{Type: JSApiStreamUpdateResponseType}} 6140 6141 osa := js.streamAssignment(acc.Name, cfg.Name) 6142 6143 if osa == nil { 6144 resp.Error = NewJSStreamNotFoundError() 6145 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6146 return 6147 } 6148 var newCfg *StreamConfig 6149 if jsa := js.accounts[acc.Name]; jsa != nil { 6150 js.mu.Unlock() 6151 ncfg, err := jsa.configUpdateCheck(osa.Config, cfg, s) 6152 js.mu.Lock() 6153 if err != nil { 6154 resp.Error = NewJSStreamUpdateError(err, Unless(err)) 6155 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6156 return 6157 } else { 6158 newCfg = ncfg 6159 } 6160 } else { 6161 resp.Error = NewJSNotEnabledForAccountError() 6162 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6163 return 6164 } 6165 // Check for mirror changes which are not allowed. 6166 if !reflect.DeepEqual(newCfg.Mirror, osa.Config.Mirror) { 6167 resp.Error = NewJSStreamMirrorNotUpdatableError() 6168 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6169 return 6170 } 6171 6172 // Check for subject collisions here. 6173 if cc.subjectsOverlap(acc.Name, cfg.Subjects, osa) { 6174 resp.Error = NewJSStreamSubjectOverlapError() 6175 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6176 return 6177 } 6178 6179 // Make copy so to not change original. 6180 rg := osa.copyGroup().Group 6181 6182 // Check for a move request. 6183 var isMoveRequest, isMoveCancel bool 6184 if lPeerSet := len(peerSet); lPeerSet > 0 { 6185 isMoveRequest = true 6186 // check if this is a cancellation 6187 if lPeerSet == osa.Config.Replicas && lPeerSet <= len(rg.Peers) { 6188 isMoveCancel = true 6189 // can only be a cancellation if the peer sets overlap as expected 6190 for i := 0; i < lPeerSet; i++ { 6191 if peerSet[i] != rg.Peers[i] { 6192 isMoveCancel = false 6193 break 6194 } 6195 } 6196 } 6197 } else { 6198 isMoveRequest = newCfg.Placement != nil && !reflect.DeepEqual(osa.Config.Placement, newCfg.Placement) 6199 } 6200 6201 // Check for replica changes. 6202 isReplicaChange := newCfg.Replicas != osa.Config.Replicas 6203 6204 // We stage consumer updates and do them after the stream update. 6205 var consumers []*consumerAssignment 6206 6207 // Check if this is a move request, but no cancellation, and we are already moving this stream. 6208 if isMoveRequest && !isMoveCancel && osa.Config.Replicas != len(rg.Peers) { 6209 // obtain stats to include in error message 6210 msg := _EMPTY_ 6211 if s.allPeersOffline(rg) { 6212 msg = fmt.Sprintf("all %d peers offline", len(rg.Peers)) 6213 } else { 6214 // Need to release js lock. 6215 js.mu.Unlock() 6216 if si, err := sysRequest[StreamInfo](s, clusterStreamInfoT, ci.serviceAccount(), cfg.Name); err != nil { 6217 msg = fmt.Sprintf("error retrieving info: %s", err.Error()) 6218 } else if si != nil { 6219 currentCount := 0 6220 if si.Cluster.Leader != _EMPTY_ { 6221 currentCount++ 6222 } 6223 combinedLag := uint64(0) 6224 for _, r := range si.Cluster.Replicas { 6225 if r.Current { 6226 currentCount++ 6227 } 6228 combinedLag += r.Lag 6229 } 6230 msg = fmt.Sprintf("total peers: %d, current peers: %d, combined lag: %d", 6231 len(rg.Peers), currentCount, combinedLag) 6232 } 6233 // Re-acquire here. 6234 js.mu.Lock() 6235 } 6236 resp.Error = NewJSStreamMoveInProgressError(msg) 6237 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6238 return 6239 } 6240 6241 // Can not move and scale at same time. 6242 if isMoveRequest && isReplicaChange { 6243 resp.Error = NewJSStreamMoveAndScaleError() 6244 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6245 return 6246 } 6247 6248 if isReplicaChange { 6249 // We are adding new peers here. 6250 if newCfg.Replicas > len(rg.Peers) { 6251 // Check that we have the allocation available. 6252 if err := js.jsClusteredStreamLimitsCheck(acc, newCfg); err != nil { 6253 resp.Error = err 6254 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6255 return 6256 } 6257 // Check if we do not have a cluster assigned, and if we do not make sure we 6258 // try to pick one. This could happen with older streams that were assigned by 6259 // previous servers. 6260 if rg.Cluster == _EMPTY_ { 6261 // Prefer placement directrives if we have them. 6262 if newCfg.Placement != nil && newCfg.Placement.Cluster != _EMPTY_ { 6263 rg.Cluster = newCfg.Placement.Cluster 6264 } else { 6265 // Fall back to the cluster assignment from the client. 6266 rg.Cluster = ci.Cluster 6267 } 6268 } 6269 peers, err := cc.selectPeerGroup(newCfg.Replicas, rg.Cluster, newCfg, rg.Peers, 0, nil) 6270 if err != nil { 6271 resp.Error = NewJSClusterNoPeersError(err) 6272 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6273 return 6274 } 6275 // Single nodes are not recorded by the NRG layer so we can rename. 6276 if len(peers) == 1 { 6277 rg.Name = groupNameForStream(peers, rg.Storage) 6278 } else if len(rg.Peers) == 1 { 6279 // This is scale up from being a singelton, set preferred to that singelton. 6280 rg.Preferred = rg.Peers[0] 6281 } 6282 rg.Peers = peers 6283 } else { 6284 // We are deleting nodes here. We want to do our best to preserve the current leader. 6285 // We have support now from above that guarantees we are in our own Go routine, so can 6286 // ask for stream info from the stream leader to make sure we keep the leader in the new list. 6287 var curLeader string 6288 if !s.allPeersOffline(rg) { 6289 // Need to release js lock. 6290 js.mu.Unlock() 6291 if si, err := sysRequest[StreamInfo](s, clusterStreamInfoT, ci.serviceAccount(), cfg.Name); err != nil { 6292 s.Warnf("Did not receive stream info results for '%s > %s' due to: %s", acc, cfg.Name, err) 6293 } else if si != nil { 6294 if cl := si.Cluster; cl != nil && cl.Leader != _EMPTY_ { 6295 curLeader = getHash(cl.Leader) 6296 } 6297 } 6298 // Re-acquire here. 6299 js.mu.Lock() 6300 } 6301 // If we identified a leader make sure its part of the new group. 6302 selected := make([]string, 0, newCfg.Replicas) 6303 6304 if curLeader != _EMPTY_ { 6305 selected = append(selected, curLeader) 6306 } 6307 for _, peer := range rg.Peers { 6308 if len(selected) == newCfg.Replicas { 6309 break 6310 } 6311 if peer == curLeader { 6312 continue 6313 } 6314 if si, ok := s.nodeToInfo.Load(peer); ok && si != nil { 6315 if si.(nodeInfo).offline { 6316 continue 6317 } 6318 selected = append(selected, peer) 6319 } 6320 } 6321 rg.Peers = selected 6322 } 6323 6324 // Need to remap any consumers. 6325 for _, ca := range osa.consumers { 6326 // Ephemerals are R=1, so only auto-remap durables, or R>1, unless stream is interest or workqueue policy. 6327 numPeers := len(ca.Group.Peers) 6328 if ca.Config.Durable != _EMPTY_ || numPeers > 1 || cfg.Retention != LimitsPolicy { 6329 cca := ca.copyGroup() 6330 // Adjust preferred as needed. 6331 if numPeers == 1 && len(rg.Peers) > 1 { 6332 cca.Group.Preferred = ca.Group.Peers[0] 6333 } else { 6334 cca.Group.Preferred = _EMPTY_ 6335 } 6336 // Assign new peers. 6337 cca.Group.Peers = rg.Peers 6338 // We can not propose here before the stream itself so we collect them. 6339 consumers = append(consumers, cca) 6340 } 6341 } 6342 } else if isMoveRequest { 6343 if len(peerSet) == 0 { 6344 nrg, err := js.createGroupForStream(ci, newCfg) 6345 if err != nil { 6346 resp.Error = NewJSClusterNoPeersError(err) 6347 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6348 return 6349 } 6350 // filter peers present in both sets 6351 for _, peer := range rg.Peers { 6352 found := false 6353 for _, newPeer := range nrg.Peers { 6354 if peer == newPeer { 6355 found = true 6356 break 6357 } 6358 } 6359 if !found { 6360 peerSet = append(peerSet, peer) 6361 } 6362 } 6363 peerSet = append(peerSet, nrg.Peers...) 6364 } 6365 if len(rg.Peers) == 1 { 6366 rg.Preferred = peerSet[0] 6367 } 6368 rg.Peers = peerSet 6369 6370 for _, ca := range osa.consumers { 6371 cca := ca.copyGroup() 6372 r := cca.Config.replicas(osa.Config) 6373 // shuffle part of cluster peer set we will be keeping 6374 randPeerSet := copyStrings(peerSet[len(peerSet)-newCfg.Replicas:]) 6375 rand.Shuffle(newCfg.Replicas, func(i, j int) { randPeerSet[i], randPeerSet[j] = randPeerSet[j], randPeerSet[i] }) 6376 // move overlapping peers at the end of randPeerSet and keep a tally of non overlapping peers 6377 dropPeerSet := make([]string, 0, len(cca.Group.Peers)) 6378 for _, p := range cca.Group.Peers { 6379 found := false 6380 for i, rp := range randPeerSet { 6381 if p == rp { 6382 randPeerSet[i] = randPeerSet[newCfg.Replicas-1] 6383 randPeerSet[newCfg.Replicas-1] = p 6384 found = true 6385 break 6386 } 6387 } 6388 if !found { 6389 dropPeerSet = append(dropPeerSet, p) 6390 } 6391 } 6392 cPeerSet := randPeerSet[newCfg.Replicas-r:] 6393 // In case of a set or cancel simply assign 6394 if len(peerSet) == newCfg.Replicas { 6395 cca.Group.Peers = cPeerSet 6396 } else { 6397 cca.Group.Peers = append(dropPeerSet, cPeerSet...) 6398 } 6399 // make sure it overlaps with peers and remove if not 6400 if cca.Group.Preferred != _EMPTY_ { 6401 found := false 6402 for _, p := range cca.Group.Peers { 6403 if p == cca.Group.Preferred { 6404 found = true 6405 break 6406 } 6407 } 6408 if !found { 6409 cca.Group.Preferred = _EMPTY_ 6410 } 6411 } 6412 // We can not propose here before the stream itself so we collect them. 6413 consumers = append(consumers, cca) 6414 } 6415 } else { 6416 // All other updates make sure no preferred is set. 6417 rg.Preferred = _EMPTY_ 6418 } 6419 6420 sa := &streamAssignment{Group: rg, Sync: osa.Sync, Created: osa.Created, Config: newCfg, Subject: subject, Reply: reply, Client: ci} 6421 meta.Propose(encodeUpdateStreamAssignment(sa)) 6422 6423 // Process any staged consumers. 6424 for _, ca := range consumers { 6425 meta.Propose(encodeAddConsumerAssignment(ca)) 6426 } 6427 } 6428 6429 func (s *Server) jsClusteredStreamDeleteRequest(ci *ClientInfo, acc *Account, stream, subject, reply string, rmsg []byte) { 6430 js, cc := s.getJetStreamCluster() 6431 if js == nil || cc == nil { 6432 return 6433 } 6434 6435 js.mu.Lock() 6436 defer js.mu.Unlock() 6437 6438 if cc.meta == nil { 6439 return 6440 } 6441 6442 osa := js.streamAssignment(acc.Name, stream) 6443 if osa == nil { 6444 var resp = JSApiStreamDeleteResponse{ApiResponse: ApiResponse{Type: JSApiStreamDeleteResponseType}} 6445 resp.Error = NewJSStreamNotFoundError() 6446 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6447 return 6448 } 6449 6450 sa := &streamAssignment{Group: osa.Group, Config: osa.Config, Subject: subject, Reply: reply, Client: ci} 6451 cc.meta.Propose(encodeDeleteStreamAssignment(sa)) 6452 } 6453 6454 // Process a clustered purge request. 6455 func (s *Server) jsClusteredStreamPurgeRequest( 6456 ci *ClientInfo, 6457 acc *Account, 6458 mset *stream, 6459 stream, subject, reply string, 6460 rmsg []byte, 6461 preq *JSApiStreamPurgeRequest, 6462 ) { 6463 js, cc := s.getJetStreamCluster() 6464 if js == nil || cc == nil { 6465 return 6466 } 6467 6468 js.mu.Lock() 6469 sa := js.streamAssignment(acc.Name, stream) 6470 if sa == nil { 6471 resp := JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}} 6472 resp.Error = NewJSStreamNotFoundError() 6473 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6474 js.mu.Unlock() 6475 return 6476 } 6477 6478 if n := sa.Group.node; n != nil { 6479 sp := &streamPurge{Stream: stream, LastSeq: mset.state().LastSeq, Subject: subject, Reply: reply, Client: ci, Request: preq} 6480 n.Propose(encodeStreamPurge(sp)) 6481 js.mu.Unlock() 6482 return 6483 } 6484 js.mu.Unlock() 6485 6486 if mset == nil { 6487 return 6488 } 6489 6490 var resp = JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}} 6491 purged, err := mset.purge(preq) 6492 if err != nil { 6493 resp.Error = NewJSStreamGeneralError(err, Unless(err)) 6494 } else { 6495 resp.Purged = purged 6496 resp.Success = true 6497 } 6498 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6499 } 6500 6501 func (s *Server) jsClusteredStreamRestoreRequest( 6502 ci *ClientInfo, 6503 acc *Account, 6504 req *JSApiStreamRestoreRequest, 6505 subject, reply string, rmsg []byte) { 6506 6507 js, cc := s.getJetStreamCluster() 6508 if js == nil || cc == nil { 6509 return 6510 } 6511 6512 js.mu.Lock() 6513 defer js.mu.Unlock() 6514 6515 if cc.meta == nil { 6516 return 6517 } 6518 6519 cfg := &req.Config 6520 resp := JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}} 6521 6522 if err := js.jsClusteredStreamLimitsCheck(acc, cfg); err != nil { 6523 resp.Error = err 6524 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6525 return 6526 } 6527 6528 if sa := js.streamAssignment(ci.serviceAccount(), cfg.Name); sa != nil { 6529 resp.Error = NewJSStreamNameExistRestoreFailedError() 6530 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6531 return 6532 } 6533 6534 // Raft group selection and placement. 6535 rg, err := js.createGroupForStream(ci, cfg) 6536 if err != nil { 6537 resp.Error = NewJSClusterNoPeersError(err) 6538 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6539 return 6540 } 6541 // Pick a preferred leader. 6542 rg.setPreferred() 6543 sa := &streamAssignment{Group: rg, Sync: syncSubjForStream(), Config: cfg, Subject: subject, Reply: reply, Client: ci, Created: time.Now().UTC()} 6544 // Now add in our restore state and pre-select a peer to handle the actual receipt of the snapshot. 6545 sa.Restore = &req.State 6546 cc.meta.Propose(encodeAddStreamAssignment(sa)) 6547 } 6548 6549 // Determine if all peers for this group are offline. 6550 func (s *Server) allPeersOffline(rg *raftGroup) bool { 6551 if rg == nil { 6552 return false 6553 } 6554 // Check to see if this stream has any servers online to respond. 6555 for _, peer := range rg.Peers { 6556 if si, ok := s.nodeToInfo.Load(peer); ok && si != nil { 6557 if !si.(nodeInfo).offline { 6558 return false 6559 } 6560 } 6561 } 6562 return true 6563 } 6564 6565 // This will do a scatter and gather operation for all streams for this account. This is only called from metadata leader. 6566 // This will be running in a separate Go routine. 6567 func (s *Server) jsClusteredStreamListRequest(acc *Account, ci *ClientInfo, filter string, offset int, subject, reply string, rmsg []byte) { 6568 defer s.grWG.Done() 6569 6570 js, cc := s.getJetStreamCluster() 6571 if js == nil || cc == nil { 6572 return 6573 } 6574 6575 js.mu.RLock() 6576 6577 var streams []*streamAssignment 6578 for _, sa := range cc.streams[acc.Name] { 6579 if IsNatsErr(sa.err, JSClusterNotAssignedErr) { 6580 continue 6581 } 6582 6583 if filter != _EMPTY_ { 6584 // These could not have subjects auto-filled in since they are raw and unprocessed. 6585 if len(sa.Config.Subjects) == 0 { 6586 if SubjectsCollide(filter, sa.Config.Name) { 6587 streams = append(streams, sa) 6588 } 6589 } else { 6590 for _, subj := range sa.Config.Subjects { 6591 if SubjectsCollide(filter, subj) { 6592 streams = append(streams, sa) 6593 break 6594 } 6595 } 6596 } 6597 } else { 6598 streams = append(streams, sa) 6599 } 6600 } 6601 6602 // Needs to be sorted for offsets etc. 6603 if len(streams) > 1 { 6604 sort.Slice(streams, func(i, j int) bool { 6605 return strings.Compare(streams[i].Config.Name, streams[j].Config.Name) < 0 6606 }) 6607 } 6608 6609 scnt := len(streams) 6610 if offset > scnt { 6611 offset = scnt 6612 } 6613 if offset > 0 { 6614 streams = streams[offset:] 6615 } 6616 if len(streams) > JSApiListLimit { 6617 streams = streams[:JSApiListLimit] 6618 } 6619 6620 var resp = JSApiStreamListResponse{ 6621 ApiResponse: ApiResponse{Type: JSApiStreamListResponseType}, 6622 Streams: make([]*StreamInfo, 0, len(streams)), 6623 } 6624 6625 js.mu.RUnlock() 6626 6627 if len(streams) == 0 { 6628 resp.Limit = JSApiListLimit 6629 resp.Offset = offset 6630 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6631 return 6632 } 6633 6634 // Create an inbox for our responses and send out our requests. 6635 s.mu.Lock() 6636 inbox := s.newRespInbox() 6637 rc := make(chan *StreamInfo, len(streams)) 6638 6639 // Store our handler. 6640 s.sys.replies[inbox] = func(sub *subscription, _ *client, _ *Account, subject, _ string, msg []byte) { 6641 var si StreamInfo 6642 if err := json.Unmarshal(msg, &si); err != nil { 6643 s.Warnf("Error unmarshalling clustered stream info response:%v", err) 6644 return 6645 } 6646 select { 6647 case rc <- &si: 6648 default: 6649 s.Warnf("Failed placing remote stream info result on internal channel") 6650 } 6651 } 6652 s.mu.Unlock() 6653 6654 // Cleanup after. 6655 defer func() { 6656 s.mu.Lock() 6657 if s.sys != nil && s.sys.replies != nil { 6658 delete(s.sys.replies, inbox) 6659 } 6660 s.mu.Unlock() 6661 }() 6662 6663 var missingNames []string 6664 sent := map[string]int{} 6665 6666 // Send out our requests here. 6667 js.mu.RLock() 6668 for _, sa := range streams { 6669 if s.allPeersOffline(sa.Group) { 6670 // Place offline onto our results by hand here. 6671 si := &StreamInfo{ 6672 Config: *sa.Config, 6673 Created: sa.Created, 6674 Cluster: js.offlineClusterInfo(sa.Group), 6675 TimeStamp: time.Now().UTC(), 6676 } 6677 resp.Streams = append(resp.Streams, si) 6678 missingNames = append(missingNames, sa.Config.Name) 6679 } else { 6680 isubj := fmt.Sprintf(clusterStreamInfoT, sa.Client.serviceAccount(), sa.Config.Name) 6681 s.sendInternalMsgLocked(isubj, inbox, nil, nil) 6682 sent[sa.Config.Name] = len(sa.consumers) 6683 } 6684 } 6685 // Don't hold lock. 6686 js.mu.RUnlock() 6687 6688 const timeout = 4 * time.Second 6689 notActive := time.NewTimer(timeout) 6690 defer notActive.Stop() 6691 6692 LOOP: 6693 for len(sent) > 0 { 6694 select { 6695 case <-s.quitCh: 6696 return 6697 case <-notActive.C: 6698 s.Warnf("Did not receive all stream info results for %q", acc) 6699 for sName := range sent { 6700 missingNames = append(missingNames, sName) 6701 } 6702 break LOOP 6703 case si := <-rc: 6704 consCount := sent[si.Config.Name] 6705 if consCount > 0 { 6706 si.State.Consumers = consCount 6707 } 6708 delete(sent, si.Config.Name) 6709 resp.Streams = append(resp.Streams, si) 6710 // Check to see if we are done. 6711 if len(resp.Streams) == len(streams) { 6712 break LOOP 6713 } 6714 } 6715 } 6716 6717 // Needs to be sorted as well. 6718 if len(resp.Streams) > 1 { 6719 sort.Slice(resp.Streams, func(i, j int) bool { 6720 return strings.Compare(resp.Streams[i].Config.Name, resp.Streams[j].Config.Name) < 0 6721 }) 6722 } 6723 6724 resp.Total = scnt 6725 resp.Limit = JSApiListLimit 6726 resp.Offset = offset 6727 resp.Missing = missingNames 6728 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6729 } 6730 6731 // This will do a scatter and gather operation for all consumers for this stream and account. 6732 // This will be running in a separate Go routine. 6733 func (s *Server) jsClusteredConsumerListRequest(acc *Account, ci *ClientInfo, offset int, stream, subject, reply string, rmsg []byte) { 6734 defer s.grWG.Done() 6735 6736 js, cc := s.getJetStreamCluster() 6737 if js == nil || cc == nil { 6738 return 6739 } 6740 6741 js.mu.RLock() 6742 6743 var consumers []*consumerAssignment 6744 if sas := cc.streams[acc.Name]; sas != nil { 6745 if sa := sas[stream]; sa != nil { 6746 // Copy over since we need to sort etc. 6747 for _, ca := range sa.consumers { 6748 consumers = append(consumers, ca) 6749 } 6750 } 6751 } 6752 // Needs to be sorted. 6753 if len(consumers) > 1 { 6754 sort.Slice(consumers, func(i, j int) bool { 6755 return strings.Compare(consumers[i].Name, consumers[j].Name) < 0 6756 }) 6757 } 6758 6759 ocnt := len(consumers) 6760 if offset > ocnt { 6761 offset = ocnt 6762 } 6763 if offset > 0 { 6764 consumers = consumers[offset:] 6765 } 6766 if len(consumers) > JSApiListLimit { 6767 consumers = consumers[:JSApiListLimit] 6768 } 6769 6770 // Send out our requests here. 6771 var resp = JSApiConsumerListResponse{ 6772 ApiResponse: ApiResponse{Type: JSApiConsumerListResponseType}, 6773 Consumers: []*ConsumerInfo{}, 6774 } 6775 6776 js.mu.RUnlock() 6777 6778 if len(consumers) == 0 { 6779 resp.Limit = JSApiListLimit 6780 resp.Offset = offset 6781 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6782 return 6783 } 6784 6785 // Create an inbox for our responses and send out requests. 6786 s.mu.Lock() 6787 inbox := s.newRespInbox() 6788 rc := make(chan *ConsumerInfo, len(consumers)) 6789 6790 // Store our handler. 6791 s.sys.replies[inbox] = func(sub *subscription, _ *client, _ *Account, subject, _ string, msg []byte) { 6792 var ci ConsumerInfo 6793 if err := json.Unmarshal(msg, &ci); err != nil { 6794 s.Warnf("Error unmarshaling clustered consumer info response:%v", err) 6795 return 6796 } 6797 select { 6798 case rc <- &ci: 6799 default: 6800 s.Warnf("Failed placing consumer info result on internal chan") 6801 } 6802 } 6803 s.mu.Unlock() 6804 6805 // Cleanup after. 6806 defer func() { 6807 s.mu.Lock() 6808 if s.sys != nil && s.sys.replies != nil { 6809 delete(s.sys.replies, inbox) 6810 } 6811 s.mu.Unlock() 6812 }() 6813 6814 var missingNames []string 6815 sent := map[string]struct{}{} 6816 6817 // Send out our requests here. 6818 js.mu.RLock() 6819 for _, ca := range consumers { 6820 if s.allPeersOffline(ca.Group) { 6821 // Place offline onto our results by hand here. 6822 ci := &ConsumerInfo{ 6823 Config: ca.Config, 6824 Created: ca.Created, 6825 Cluster: js.offlineClusterInfo(ca.Group), 6826 TimeStamp: time.Now().UTC(), 6827 } 6828 resp.Consumers = append(resp.Consumers, ci) 6829 missingNames = append(missingNames, ca.Name) 6830 } else { 6831 isubj := fmt.Sprintf(clusterConsumerInfoT, ca.Client.serviceAccount(), stream, ca.Name) 6832 s.sendInternalMsgLocked(isubj, inbox, nil, nil) 6833 sent[ca.Name] = struct{}{} 6834 } 6835 } 6836 // Don't hold lock. 6837 js.mu.RUnlock() 6838 6839 const timeout = 4 * time.Second 6840 notActive := time.NewTimer(timeout) 6841 defer notActive.Stop() 6842 6843 LOOP: 6844 for len(sent) > 0 { 6845 select { 6846 case <-s.quitCh: 6847 return 6848 case <-notActive.C: 6849 s.Warnf("Did not receive all consumer info results for '%s > %s'", acc, stream) 6850 for cName := range sent { 6851 missingNames = append(missingNames, cName) 6852 } 6853 break LOOP 6854 case ci := <-rc: 6855 delete(sent, ci.Name) 6856 resp.Consumers = append(resp.Consumers, ci) 6857 // Check to see if we are done. 6858 if len(resp.Consumers) == len(consumers) { 6859 break LOOP 6860 } 6861 } 6862 } 6863 6864 // Needs to be sorted as well. 6865 if len(resp.Consumers) > 1 { 6866 sort.Slice(resp.Consumers, func(i, j int) bool { 6867 return strings.Compare(resp.Consumers[i].Name, resp.Consumers[j].Name) < 0 6868 }) 6869 } 6870 6871 resp.Total = ocnt 6872 resp.Limit = JSApiListLimit 6873 resp.Offset = offset 6874 resp.Missing = missingNames 6875 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6876 } 6877 6878 func encodeStreamPurge(sp *streamPurge) []byte { 6879 var bb bytes.Buffer 6880 bb.WriteByte(byte(purgeStreamOp)) 6881 json.NewEncoder(&bb).Encode(sp) 6882 return bb.Bytes() 6883 } 6884 6885 func decodeStreamPurge(buf []byte) (*streamPurge, error) { 6886 var sp streamPurge 6887 err := json.Unmarshal(buf, &sp) 6888 return &sp, err 6889 } 6890 6891 func (s *Server) jsClusteredConsumerDeleteRequest(ci *ClientInfo, acc *Account, stream, consumer, subject, reply string, rmsg []byte) { 6892 js, cc := s.getJetStreamCluster() 6893 if js == nil || cc == nil { 6894 return 6895 } 6896 6897 js.mu.Lock() 6898 defer js.mu.Unlock() 6899 6900 if cc.meta == nil { 6901 return 6902 } 6903 6904 var resp = JSApiConsumerDeleteResponse{ApiResponse: ApiResponse{Type: JSApiConsumerDeleteResponseType}} 6905 6906 sa := js.streamAssignment(acc.Name, stream) 6907 if sa == nil { 6908 resp.Error = NewJSStreamNotFoundError() 6909 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6910 return 6911 6912 } 6913 if sa.consumers == nil { 6914 resp.Error = NewJSConsumerNotFoundError() 6915 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6916 return 6917 } 6918 oca := sa.consumers[consumer] 6919 if oca == nil { 6920 resp.Error = NewJSConsumerNotFoundError() 6921 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 6922 return 6923 } 6924 oca.deleted = true 6925 ca := &consumerAssignment{Group: oca.Group, Stream: stream, Name: consumer, Config: oca.Config, Subject: subject, Reply: reply, Client: ci} 6926 cc.meta.Propose(encodeDeleteConsumerAssignment(ca)) 6927 } 6928 6929 func encodeMsgDelete(md *streamMsgDelete) []byte { 6930 var bb bytes.Buffer 6931 bb.WriteByte(byte(deleteMsgOp)) 6932 json.NewEncoder(&bb).Encode(md) 6933 return bb.Bytes() 6934 } 6935 6936 func decodeMsgDelete(buf []byte) (*streamMsgDelete, error) { 6937 var md streamMsgDelete 6938 err := json.Unmarshal(buf, &md) 6939 return &md, err 6940 } 6941 6942 func (s *Server) jsClusteredMsgDeleteRequest(ci *ClientInfo, acc *Account, mset *stream, stream, subject, reply string, req *JSApiMsgDeleteRequest, rmsg []byte) { 6943 js, cc := s.getJetStreamCluster() 6944 if js == nil || cc == nil { 6945 return 6946 } 6947 6948 js.mu.Lock() 6949 sa := js.streamAssignment(acc.Name, stream) 6950 if sa == nil { 6951 s.Debugf("Message delete failed, could not locate stream '%s > %s'", acc.Name, stream) 6952 js.mu.Unlock() 6953 return 6954 } 6955 6956 // Check for single replica items. 6957 if n := sa.Group.node; n != nil { 6958 md := streamMsgDelete{Seq: req.Seq, NoErase: req.NoErase, Stream: stream, Subject: subject, Reply: reply, Client: ci} 6959 n.Propose(encodeMsgDelete(&md)) 6960 js.mu.Unlock() 6961 return 6962 } 6963 js.mu.Unlock() 6964 6965 if mset == nil { 6966 return 6967 } 6968 6969 var err error 6970 var removed bool 6971 if req.NoErase { 6972 removed, err = mset.removeMsg(req.Seq) 6973 } else { 6974 removed, err = mset.eraseMsg(req.Seq) 6975 } 6976 var resp = JSApiMsgDeleteResponse{ApiResponse: ApiResponse{Type: JSApiMsgDeleteResponseType}} 6977 if err != nil { 6978 resp.Error = NewJSStreamMsgDeleteFailedError(err, Unless(err)) 6979 } else if !removed { 6980 resp.Error = NewJSSequenceNotFoundError(req.Seq) 6981 } else { 6982 resp.Success = true 6983 } 6984 s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp)) 6985 } 6986 6987 func encodeAddStreamAssignment(sa *streamAssignment) []byte { 6988 var bb bytes.Buffer 6989 bb.WriteByte(byte(assignStreamOp)) 6990 json.NewEncoder(&bb).Encode(sa) 6991 return bb.Bytes() 6992 } 6993 6994 func encodeUpdateStreamAssignment(sa *streamAssignment) []byte { 6995 var bb bytes.Buffer 6996 bb.WriteByte(byte(updateStreamOp)) 6997 json.NewEncoder(&bb).Encode(sa) 6998 return bb.Bytes() 6999 } 7000 7001 func encodeDeleteStreamAssignment(sa *streamAssignment) []byte { 7002 var bb bytes.Buffer 7003 bb.WriteByte(byte(removeStreamOp)) 7004 json.NewEncoder(&bb).Encode(sa) 7005 return bb.Bytes() 7006 } 7007 7008 func decodeStreamAssignment(buf []byte) (*streamAssignment, error) { 7009 var sa streamAssignment 7010 err := json.Unmarshal(buf, &sa) 7011 if err != nil { 7012 return nil, err 7013 } 7014 fixCfgMirrorWithDedupWindow(sa.Config) 7015 return &sa, err 7016 } 7017 7018 func encodeDeleteRange(dr *DeleteRange) []byte { 7019 var bb bytes.Buffer 7020 bb.WriteByte(byte(deleteRangeOp)) 7021 json.NewEncoder(&bb).Encode(dr) 7022 return bb.Bytes() 7023 } 7024 7025 func decodeDeleteRange(buf []byte) (*DeleteRange, error) { 7026 var dr DeleteRange 7027 err := json.Unmarshal(buf, &dr) 7028 if err != nil { 7029 return nil, err 7030 } 7031 return &dr, err 7032 } 7033 7034 // createGroupForConsumer will create a new group from same peer set as the stream. 7035 func (cc *jetStreamCluster) createGroupForConsumer(cfg *ConsumerConfig, sa *streamAssignment) *raftGroup { 7036 if len(sa.Group.Peers) == 0 || cfg.Replicas > len(sa.Group.Peers) { 7037 return nil 7038 } 7039 7040 peers := copyStrings(sa.Group.Peers) 7041 var _ss [5]string 7042 active := _ss[:0] 7043 7044 // Calculate all active peers. 7045 for _, peer := range peers { 7046 if sir, ok := cc.s.nodeToInfo.Load(peer); ok && sir != nil { 7047 if !sir.(nodeInfo).offline { 7048 active = append(active, peer) 7049 } 7050 } 7051 } 7052 if quorum := cfg.Replicas/2 + 1; quorum > len(active) { 7053 // Not enough active to satisfy the request. 7054 return nil 7055 } 7056 7057 // If we want less then our parent stream, select from active. 7058 if cfg.Replicas > 0 && cfg.Replicas < len(peers) { 7059 // Pedantic in case stream is say R5 and consumer is R3 and 3 or more offline, etc. 7060 if len(active) < cfg.Replicas { 7061 return nil 7062 } 7063 // First shuffle the active peers and then select to account for replica = 1. 7064 rand.Shuffle(len(active), func(i, j int) { active[i], active[j] = active[j], active[i] }) 7065 peers = active[:cfg.Replicas] 7066 } 7067 storage := sa.Config.Storage 7068 if cfg.MemoryStorage { 7069 storage = MemoryStorage 7070 } 7071 return &raftGroup{Name: groupNameForConsumer(peers, storage), Storage: storage, Peers: peers} 7072 } 7073 7074 // jsClusteredConsumerRequest is first point of entry to create a consumer in clustered mode. 7075 func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, stream string, cfg *ConsumerConfig, action ConsumerAction) { 7076 js, cc := s.getJetStreamCluster() 7077 if js == nil || cc == nil { 7078 return 7079 } 7080 7081 var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} 7082 7083 streamCfg, ok := js.clusterStreamConfig(acc.Name, stream) 7084 if !ok { 7085 resp.Error = NewJSStreamNotFoundError() 7086 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7087 return 7088 } 7089 selectedLimits, _, _, apiErr := acc.selectLimits(&streamCfg) 7090 if apiErr != nil { 7091 resp.Error = apiErr 7092 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7093 return 7094 } 7095 srvLim := &s.getOpts().JetStreamLimits 7096 // Make sure we have sane defaults 7097 setConsumerConfigDefaults(cfg, &streamCfg, srvLim, selectedLimits) 7098 7099 if err := checkConsumerCfg(cfg, srvLim, &streamCfg, acc, selectedLimits, false); err != nil { 7100 resp.Error = err 7101 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7102 return 7103 } 7104 7105 js.mu.Lock() 7106 defer js.mu.Unlock() 7107 7108 if cc.meta == nil { 7109 return 7110 } 7111 7112 // Lookup the stream assignment. 7113 sa := js.streamAssignment(acc.Name, stream) 7114 if sa == nil { 7115 resp.Error = NewJSStreamNotFoundError() 7116 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7117 return 7118 } 7119 7120 // Check for max consumers here to short circuit if possible. 7121 // Start with limit on a stream, but if one is defined at the level of the account 7122 // and is lower, use that limit. 7123 maxc := sa.Config.MaxConsumers 7124 if maxc <= 0 || (selectedLimits.MaxConsumers > 0 && selectedLimits.MaxConsumers < maxc) { 7125 maxc = selectedLimits.MaxConsumers 7126 } 7127 if maxc > 0 { 7128 // Don't count DIRECTS. 7129 total := 0 7130 for _, ca := range sa.consumers { 7131 if ca.Config != nil && !ca.Config.Direct { 7132 total++ 7133 } 7134 } 7135 if total >= maxc { 7136 resp.Error = NewJSMaximumConsumersLimitError() 7137 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7138 return 7139 } 7140 } 7141 7142 // Also short circuit if DeliverLastPerSubject is set with no FilterSubject. 7143 if cfg.DeliverPolicy == DeliverLastPerSubject { 7144 if cfg.FilterSubject == _EMPTY_ && len(cfg.FilterSubjects) == 0 { 7145 resp.Error = NewJSConsumerInvalidPolicyError(fmt.Errorf("consumer delivery policy is deliver last per subject, but FilterSubject is not set")) 7146 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7147 return 7148 } 7149 } 7150 7151 // Setup proper default for ack wait if we are in explicit ack mode. 7152 if cfg.AckWait == 0 && (cfg.AckPolicy == AckExplicit || cfg.AckPolicy == AckAll) { 7153 cfg.AckWait = JsAckWaitDefault 7154 } 7155 // Setup default of -1, meaning no limit for MaxDeliver. 7156 if cfg.MaxDeliver == 0 { 7157 cfg.MaxDeliver = -1 7158 } 7159 // Set proper default for max ack pending if we are ack explicit and none has been set. 7160 if cfg.AckPolicy == AckExplicit && cfg.MaxAckPending == 0 { 7161 cfg.MaxAckPending = JsDefaultMaxAckPending 7162 } 7163 7164 var ca *consumerAssignment 7165 var oname string 7166 7167 // See if we have an existing one already under same durable name or 7168 // if name was set by the user. 7169 if isDurableConsumer(cfg) || cfg.Name != _EMPTY_ { 7170 if cfg.Name != _EMPTY_ { 7171 oname = cfg.Name 7172 } else { 7173 oname = cfg.Durable 7174 } 7175 if ca = sa.consumers[oname]; ca != nil && !ca.deleted { 7176 if action == ActionCreate && !reflect.DeepEqual(cfg, ca.Config) { 7177 resp.Error = NewJSConsumerAlreadyExistsError() 7178 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7179 return 7180 } 7181 // Do quick sanity check on new cfg to prevent here if possible. 7182 if err := acc.checkNewConsumerConfig(ca.Config, cfg); err != nil { 7183 resp.Error = NewJSConsumerCreateError(err, Unless(err)) 7184 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7185 return 7186 } 7187 } 7188 } 7189 7190 // If this is new consumer. 7191 if ca == nil { 7192 if action == ActionUpdate { 7193 resp.Error = NewJSConsumerDoesNotExistError() 7194 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7195 return 7196 } 7197 rg := cc.createGroupForConsumer(cfg, sa) 7198 if rg == nil { 7199 resp.Error = NewJSInsufficientResourcesError() 7200 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7201 return 7202 } 7203 // Pick a preferred leader. 7204 rg.setPreferred() 7205 7206 // Inherit cluster from stream. 7207 rg.Cluster = sa.Group.Cluster 7208 7209 // We need to set the ephemeral here before replicating. 7210 if !isDurableConsumer(cfg) { 7211 // We chose to have ephemerals be R=1 unless stream is interest or workqueue. 7212 // Consumer can override. 7213 if sa.Config.Retention == LimitsPolicy && cfg.Replicas <= 1 { 7214 rg.Peers = []string{rg.Preferred} 7215 rg.Name = groupNameForConsumer(rg.Peers, rg.Storage) 7216 } 7217 if cfg.Name != _EMPTY_ { 7218 oname = cfg.Name 7219 } else { 7220 // Make sure name is unique. 7221 for { 7222 oname = createConsumerName() 7223 if sa.consumers != nil { 7224 if sa.consumers[oname] != nil { 7225 continue 7226 } 7227 } 7228 break 7229 } 7230 } 7231 } 7232 if len(rg.Peers) > 1 { 7233 if maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets; maxHaAssets != 0 { 7234 for _, peer := range rg.Peers { 7235 if ni, ok := s.nodeToInfo.Load(peer); ok { 7236 ni := ni.(nodeInfo) 7237 if stats := ni.stats; stats != nil && stats.HAAssets > maxHaAssets { 7238 resp.Error = NewJSInsufficientResourcesError() 7239 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7240 s.Warnf("%s@%s (HA Asset Count: %d) exceeds max ha asset limit of %d"+ 7241 " for (durable) consumer %s placement on stream %s", 7242 ni.name, ni.cluster, ni.stats.HAAssets, maxHaAssets, oname, stream) 7243 return 7244 } 7245 } 7246 } 7247 } 7248 } 7249 7250 // Check if we are work queue policy. 7251 // We will do pre-checks here to avoid thrashing meta layer. 7252 if sa.Config.Retention == WorkQueuePolicy && !cfg.Direct { 7253 if cfg.AckPolicy != AckExplicit { 7254 resp.Error = NewJSConsumerWQRequiresExplicitAckError() 7255 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7256 return 7257 } 7258 subjects := gatherSubjectFilters(cfg.FilterSubject, cfg.FilterSubjects) 7259 if len(subjects) == 0 && len(sa.consumers) > 0 { 7260 resp.Error = NewJSConsumerWQMultipleUnfilteredError() 7261 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7262 return 7263 } 7264 // Check here to make sure we have not collided with another. 7265 if len(sa.consumers) > 0 { 7266 for _, oca := range sa.consumers { 7267 if oca.Name == oname { 7268 continue 7269 } 7270 for _, psubj := range gatherSubjectFilters(oca.Config.FilterSubject, oca.Config.FilterSubjects) { 7271 for _, subj := range subjects { 7272 if SubjectsCollide(subj, psubj) { 7273 resp.Error = NewJSConsumerWQConsumerNotUniqueError() 7274 s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) 7275 return 7276 } 7277 } 7278 } 7279 } 7280 } 7281 } 7282 7283 ca = &consumerAssignment{ 7284 Group: rg, 7285 Stream: stream, 7286 Name: oname, 7287 Config: cfg, 7288 Subject: subject, 7289 Reply: reply, 7290 Client: ci, 7291 Created: time.Now().UTC(), 7292 } 7293 } else { 7294 // If the consumer already exists then don't allow updating the PauseUntil, just set 7295 // it back to whatever the current configured value is. 7296 cfg.PauseUntil = ca.Config.PauseUntil 7297 7298 nca := ca.copyGroup() 7299 7300 rBefore := nca.Config.replicas(sa.Config) 7301 rAfter := cfg.replicas(sa.Config) 7302 7303 var curLeader string 7304 if rBefore != rAfter { 7305 // We are modifying nodes here. We want to do our best to preserve the current leader. 7306 // We have support now from above that guarantees we are in our own Go routine, so can 7307 // ask for stream info from the stream leader to make sure we keep the leader in the new list. 7308 if !s.allPeersOffline(ca.Group) { 7309 // Need to release js lock. 7310 js.mu.Unlock() 7311 if ci, err := sysRequest[ConsumerInfo](s, clusterConsumerInfoT, ci.serviceAccount(), sa.Config.Name, cfg.Durable); err != nil { 7312 s.Warnf("Did not receive consumer info results for '%s > %s > %s' due to: %s", acc, sa.Config.Name, cfg.Durable, err) 7313 } else if ci != nil { 7314 if cl := ci.Cluster; cl != nil { 7315 curLeader = getHash(cl.Leader) 7316 } 7317 } 7318 // Re-acquire here. 7319 js.mu.Lock() 7320 } 7321 } 7322 7323 if rBefore < rAfter { 7324 newPeerSet := nca.Group.Peers 7325 // scale up by adding new members from the stream peer set that are not yet in the consumer peer set 7326 streamPeerSet := copyStrings(sa.Group.Peers) 7327 rand.Shuffle(rAfter, func(i, j int) { streamPeerSet[i], streamPeerSet[j] = streamPeerSet[j], streamPeerSet[i] }) 7328 for _, p := range streamPeerSet { 7329 found := false 7330 for _, sp := range newPeerSet { 7331 if sp == p { 7332 found = true 7333 break 7334 } 7335 } 7336 if !found { 7337 newPeerSet = append(newPeerSet, p) 7338 if len(newPeerSet) == rAfter { 7339 break 7340 } 7341 } 7342 } 7343 nca.Group.Peers = newPeerSet 7344 nca.Group.Preferred = curLeader 7345 } else if rBefore > rAfter { 7346 newPeerSet := nca.Group.Peers 7347 // mark leader preferred and move it to end 7348 nca.Group.Preferred = curLeader 7349 if nca.Group.Preferred != _EMPTY_ { 7350 for i, p := range newPeerSet { 7351 if nca.Group.Preferred == p { 7352 newPeerSet[i] = newPeerSet[len(newPeerSet)-1] 7353 newPeerSet[len(newPeerSet)-1] = p 7354 } 7355 } 7356 } 7357 // scale down by removing peers from the end 7358 newPeerSet = newPeerSet[len(newPeerSet)-rAfter:] 7359 nca.Group.Peers = newPeerSet 7360 } 7361 7362 // Update config and client info on copy of existing. 7363 nca.Config = cfg 7364 nca.Client = ci 7365 nca.Subject = subject 7366 nca.Reply = reply 7367 ca = nca 7368 } 7369 7370 // Mark this as pending. 7371 if sa.consumers == nil { 7372 sa.consumers = make(map[string]*consumerAssignment) 7373 } 7374 sa.consumers[ca.Name] = ca 7375 7376 // Do formal proposal. 7377 cc.meta.Propose(encodeAddConsumerAssignment(ca)) 7378 } 7379 7380 func encodeAddConsumerAssignment(ca *consumerAssignment) []byte { 7381 var bb bytes.Buffer 7382 bb.WriteByte(byte(assignConsumerOp)) 7383 json.NewEncoder(&bb).Encode(ca) 7384 return bb.Bytes() 7385 } 7386 7387 func encodeDeleteConsumerAssignment(ca *consumerAssignment) []byte { 7388 var bb bytes.Buffer 7389 bb.WriteByte(byte(removeConsumerOp)) 7390 json.NewEncoder(&bb).Encode(ca) 7391 return bb.Bytes() 7392 } 7393 7394 func decodeConsumerAssignment(buf []byte) (*consumerAssignment, error) { 7395 var ca consumerAssignment 7396 err := json.Unmarshal(buf, &ca) 7397 return &ca, err 7398 } 7399 7400 func encodeAddConsumerAssignmentCompressed(ca *consumerAssignment) []byte { 7401 b, err := json.Marshal(ca) 7402 if err != nil { 7403 return nil 7404 } 7405 // TODO(dlc) - Streaming better approach here probably. 7406 var bb bytes.Buffer 7407 bb.WriteByte(byte(assignCompressedConsumerOp)) 7408 bb.Write(s2.Encode(nil, b)) 7409 return bb.Bytes() 7410 } 7411 7412 func decodeConsumerAssignmentCompressed(buf []byte) (*consumerAssignment, error) { 7413 var ca consumerAssignment 7414 js, err := s2.Decode(nil, buf) 7415 if err != nil { 7416 return nil, err 7417 } 7418 err = json.Unmarshal(js, &ca) 7419 return &ca, err 7420 } 7421 7422 var errBadStreamMsg = errors.New("jetstream cluster bad replicated stream msg") 7423 7424 func decodeStreamMsg(buf []byte) (subject, reply string, hdr, msg []byte, lseq uint64, ts int64, err error) { 7425 var le = binary.LittleEndian 7426 if len(buf) < 26 { 7427 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7428 } 7429 lseq = le.Uint64(buf) 7430 buf = buf[8:] 7431 ts = int64(le.Uint64(buf)) 7432 buf = buf[8:] 7433 sl := int(le.Uint16(buf)) 7434 buf = buf[2:] 7435 if len(buf) < sl { 7436 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7437 } 7438 subject = string(buf[:sl]) 7439 buf = buf[sl:] 7440 if len(buf) < 2 { 7441 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7442 } 7443 rl := int(le.Uint16(buf)) 7444 buf = buf[2:] 7445 if len(buf) < rl { 7446 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7447 } 7448 reply = string(buf[:rl]) 7449 buf = buf[rl:] 7450 if len(buf) < 2 { 7451 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7452 } 7453 hl := int(le.Uint16(buf)) 7454 buf = buf[2:] 7455 if len(buf) < hl { 7456 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7457 } 7458 if hdr = buf[:hl]; len(hdr) == 0 { 7459 hdr = nil 7460 } 7461 buf = buf[hl:] 7462 if len(buf) < 4 { 7463 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7464 } 7465 ml := int(le.Uint32(buf)) 7466 buf = buf[4:] 7467 if len(buf) < ml { 7468 return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg 7469 } 7470 if msg = buf[:ml]; len(msg) == 0 { 7471 msg = nil 7472 } 7473 return subject, reply, hdr, msg, lseq, ts, nil 7474 } 7475 7476 // Helper to return if compression allowed. 7477 func (mset *stream) compressAllowed() bool { 7478 mset.clMu.Lock() 7479 defer mset.clMu.Unlock() 7480 return mset.compressOK 7481 } 7482 7483 func encodeStreamMsg(subject, reply string, hdr, msg []byte, lseq uint64, ts int64) []byte { 7484 return encodeStreamMsgAllowCompress(subject, reply, hdr, msg, lseq, ts, false) 7485 } 7486 7487 // Threshold for compression. 7488 // TODO(dlc) - Eventually make configurable. 7489 const compressThreshold = 256 7490 7491 // If allowed and contents over the threshold we will compress. 7492 func encodeStreamMsgAllowCompress(subject, reply string, hdr, msg []byte, lseq uint64, ts int64, compressOK bool) []byte { 7493 shouldCompress := compressOK && len(subject)+len(reply)+len(hdr)+len(msg) > compressThreshold 7494 7495 elen := 1 + 8 + 8 + len(subject) + len(reply) + len(hdr) + len(msg) 7496 elen += (2 + 2 + 2 + 4) // Encoded lengths, 4bytes 7497 // TODO(dlc) - check sizes of subject, reply and hdr, make sure uint16 ok. 7498 buf := make([]byte, elen) 7499 buf[0] = byte(streamMsgOp) 7500 var le = binary.LittleEndian 7501 wi := 1 7502 le.PutUint64(buf[wi:], lseq) 7503 wi += 8 7504 le.PutUint64(buf[wi:], uint64(ts)) 7505 wi += 8 7506 le.PutUint16(buf[wi:], uint16(len(subject))) 7507 wi += 2 7508 copy(buf[wi:], subject) 7509 wi += len(subject) 7510 le.PutUint16(buf[wi:], uint16(len(reply))) 7511 wi += 2 7512 copy(buf[wi:], reply) 7513 wi += len(reply) 7514 le.PutUint16(buf[wi:], uint16(len(hdr))) 7515 wi += 2 7516 if len(hdr) > 0 { 7517 copy(buf[wi:], hdr) 7518 wi += len(hdr) 7519 } 7520 le.PutUint32(buf[wi:], uint32(len(msg))) 7521 wi += 4 7522 if len(msg) > 0 { 7523 copy(buf[wi:], msg) 7524 wi += len(msg) 7525 } 7526 7527 // Check if we should compress. 7528 if shouldCompress { 7529 nbuf := make([]byte, s2.MaxEncodedLen(elen)) 7530 nbuf[0] = byte(compressedStreamMsgOp) 7531 ebuf := s2.Encode(nbuf[1:], buf[1:wi]) 7532 // Only pay cost of decode the other side if we compressed. 7533 // S2 will allow us to try without major penalty for non-compressable data. 7534 if len(ebuf) < wi { 7535 nbuf = nbuf[:len(ebuf)+1] 7536 buf, wi = nbuf, len(nbuf) 7537 } 7538 } 7539 7540 return buf[:wi] 7541 } 7542 7543 // Determine if all peers in our set support the binary snapshot. 7544 func (mset *stream) supportsBinarySnapshot() bool { 7545 mset.mu.RLock() 7546 defer mset.mu.RUnlock() 7547 return mset.supportsBinarySnapshotLocked() 7548 } 7549 7550 // Determine if all peers in our set support the binary snapshot. 7551 // Lock should be held. 7552 func (mset *stream) supportsBinarySnapshotLocked() bool { 7553 s, n := mset.srv, mset.node 7554 if s == nil || n == nil { 7555 return false 7556 } 7557 // Grab our peers and walk them to make sure we can all support binary stream snapshots. 7558 id, peers := n.ID(), n.Peers() 7559 for _, p := range peers { 7560 if p.ID == id { 7561 // We know we support ourselves. 7562 continue 7563 } 7564 if sir, ok := s.nodeToInfo.Load(p.ID); !ok || sir == nil || !sir.(nodeInfo).binarySnapshots { 7565 return false 7566 } 7567 } 7568 return true 7569 } 7570 7571 // StreamSnapshot is used for snapshotting and out of band catch up in clustered mode. 7572 // Legacy, replace with binary stream snapshots. 7573 type streamSnapshot struct { 7574 Msgs uint64 `json:"messages"` 7575 Bytes uint64 `json:"bytes"` 7576 FirstSeq uint64 `json:"first_seq"` 7577 LastSeq uint64 `json:"last_seq"` 7578 Failed uint64 `json:"clfs"` 7579 Deleted []uint64 `json:"deleted,omitempty"` 7580 } 7581 7582 // Grab a snapshot of a stream for clustered mode. 7583 func (mset *stream) stateSnapshot() []byte { 7584 mset.mu.RLock() 7585 defer mset.mu.RUnlock() 7586 return mset.stateSnapshotLocked() 7587 } 7588 7589 // Grab a snapshot of a stream for clustered mode. 7590 // Lock should be held. 7591 func (mset *stream) stateSnapshotLocked() []byte { 7592 // Decide if we can support the new style of stream snapshots. 7593 if mset.supportsBinarySnapshotLocked() { 7594 snap, _ := mset.store.EncodedStreamState(mset.getCLFS()) 7595 return snap 7596 } 7597 7598 // Older v1 version with deleted as a sorted []uint64. 7599 state := mset.store.State() 7600 snap := &streamSnapshot{ 7601 Msgs: state.Msgs, 7602 Bytes: state.Bytes, 7603 FirstSeq: state.FirstSeq, 7604 LastSeq: state.LastSeq, 7605 Failed: mset.getCLFS(), 7606 Deleted: state.Deleted, 7607 } 7608 b, _ := json.Marshal(snap) 7609 return b 7610 } 7611 7612 // Will check if we can do message compression in RAFT and catchup logic. 7613 func (mset *stream) checkAllowMsgCompress(peers []string) { 7614 allowed := true 7615 for _, id := range peers { 7616 sir, ok := mset.srv.nodeToInfo.Load(id) 7617 if !ok || sir == nil { 7618 allowed = false 7619 break 7620 } 7621 // Check for capability. 7622 if si := sir.(nodeInfo); si.cfg == nil || !si.cfg.CompressOK { 7623 allowed = false 7624 break 7625 } 7626 } 7627 mset.mu.Lock() 7628 mset.compressOK = allowed 7629 mset.mu.Unlock() 7630 } 7631 7632 // To warn when we are getting too far behind from what has been proposed vs what has been committed. 7633 const streamLagWarnThreshold = 10_000 7634 7635 // processClusteredInboundMsg will propose the inbound message to the underlying raft group. 7636 func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg []byte, mt *msgTrace) (retErr error) { 7637 // For possible error response. 7638 var response []byte 7639 7640 mset.mu.RLock() 7641 canRespond := !mset.cfg.NoAck && len(reply) > 0 7642 name, stype, store := mset.cfg.Name, mset.cfg.Storage, mset.store 7643 s, js, jsa, st, r, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.tier, mset.outq, mset.node 7644 maxMsgSize, lseq := int(mset.cfg.MaxMsgSize), mset.lseq 7645 interestPolicy, discard, maxMsgs, maxBytes := mset.cfg.Retention != LimitsPolicy, mset.cfg.Discard, mset.cfg.MaxMsgs, mset.cfg.MaxBytes 7646 isLeader, isSealed := mset.isLeader(), mset.cfg.Sealed 7647 mset.mu.RUnlock() 7648 7649 // This should not happen but possible now that we allow scale up, and scale down where this could trigger. 7650 // 7651 // We also invoke this in clustering mode for message tracing when not 7652 // performing message delivery. 7653 if node == nil || mt.traceOnly() { 7654 return mset.processJetStreamMsg(subject, reply, hdr, msg, 0, 0, mt) 7655 } 7656 7657 // If message tracing (with message delivery), we will need to send the 7658 // event on exit in case there was an error (if message was not proposed). 7659 // Otherwise, the event will be sent from processJetStreamMsg when 7660 // invoked by the leader (from applyStreamEntries). 7661 if mt != nil { 7662 defer func() { 7663 if retErr != nil { 7664 mt.sendEventFromJetStream(retErr) 7665 } 7666 }() 7667 } 7668 7669 // Check that we are the leader. This can be false if we have scaled up from an R1 that had inbound queued messages. 7670 if !isLeader { 7671 return NewJSClusterNotLeaderError() 7672 } 7673 7674 // Bail here if sealed. 7675 if isSealed { 7676 var resp = JSPubAckResponse{PubAck: &PubAck{Stream: mset.name()}, Error: NewJSStreamSealedError()} 7677 b, _ := json.Marshal(resp) 7678 mset.outq.sendMsg(reply, b) 7679 return NewJSStreamSealedError() 7680 } 7681 7682 // Check here pre-emptively if we have exceeded this server limits. 7683 if js.limitsExceeded(stype) { 7684 s.resourcesExceededError() 7685 if canRespond { 7686 b, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: NewJSInsufficientResourcesError()}) 7687 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, b, nil, 0)) 7688 } 7689 // Stepdown regardless. 7690 if node := mset.raftNode(); node != nil { 7691 node.StepDown() 7692 } 7693 return NewJSInsufficientResourcesError() 7694 } 7695 7696 // Check here pre-emptively if we have exceeded our account limits. 7697 if exceeded, err := jsa.wouldExceedLimits(st, tierName, r, subject, hdr, msg); exceeded { 7698 if err == nil { 7699 err = NewJSAccountResourcesExceededError() 7700 } 7701 s.RateLimitWarnf(err.Error()) 7702 if canRespond { 7703 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7704 resp.Error = err 7705 response, _ = json.Marshal(resp) 7706 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) 7707 } 7708 return err 7709 } 7710 7711 // Check msgSize if we have a limit set there. Again this works if it goes through but better to be pre-emptive. 7712 if maxMsgSize >= 0 && (len(hdr)+len(msg)) > maxMsgSize { 7713 err := fmt.Errorf("JetStream message size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name) 7714 s.RateLimitWarnf(err.Error()) 7715 if canRespond { 7716 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7717 resp.Error = NewJSStreamMessageExceedsMaximumError() 7718 response, _ = json.Marshal(resp) 7719 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) 7720 } 7721 return err 7722 } 7723 7724 // Some header checks can be checked pre proposal. Most can not. 7725 var msgId string 7726 if len(hdr) > 0 { 7727 // Since we encode header len as u16 make sure we do not exceed. 7728 // Again this works if it goes through but better to be pre-emptive. 7729 if len(hdr) > math.MaxUint16 { 7730 err := fmt.Errorf("JetStream header size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name) 7731 s.RateLimitWarnf(err.Error()) 7732 if canRespond { 7733 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7734 resp.Error = NewJSStreamHeaderExceedsMaximumError() 7735 response, _ = json.Marshal(resp) 7736 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) 7737 } 7738 return err 7739 } 7740 // Expected last sequence per subject. 7741 // We can check for last sequence per subject but only if the expected seq <= lseq. 7742 if seq, exists := getExpectedLastSeqPerSubject(hdr); exists && store != nil && seq > 0 && seq <= lseq { 7743 var smv StoreMsg 7744 var fseq uint64 7745 sm, err := store.LoadLastMsg(subject, &smv) 7746 if sm != nil { 7747 fseq = sm.seq 7748 } 7749 if err != nil || fseq != seq { 7750 if canRespond { 7751 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7752 resp.PubAck = &PubAck{Stream: name} 7753 resp.Error = NewJSStreamWrongLastSequenceError(fseq) 7754 b, _ := json.Marshal(resp) 7755 outq.sendMsg(reply, b) 7756 } 7757 return fmt.Errorf("last sequence by subject mismatch: %d vs %d", seq, fseq) 7758 } 7759 } 7760 // Expected stream name can also be pre-checked. 7761 if sname := getExpectedStream(hdr); sname != _EMPTY_ && sname != name { 7762 if canRespond { 7763 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7764 resp.PubAck = &PubAck{Stream: name} 7765 resp.Error = NewJSStreamNotMatchError() 7766 b, _ := json.Marshal(resp) 7767 outq.sendMsg(reply, b) 7768 } 7769 return errStreamMismatch 7770 } 7771 // Check for MsgIds here at the cluster level to avoid excessive CLFS accounting. 7772 // Will help during restarts. 7773 if msgId = getMsgId(hdr); msgId != _EMPTY_ { 7774 mset.mu.Lock() 7775 if dde := mset.checkMsgId(msgId); dde != nil { 7776 var buf [256]byte 7777 pubAck := append(buf[:0], mset.pubAck...) 7778 seq := dde.seq 7779 mset.mu.Unlock() 7780 if canRespond { 7781 response := append(pubAck, strconv.FormatUint(seq, 10)...) 7782 response = append(response, ",\"duplicate\": true}"...) 7783 outq.sendMsg(reply, response) 7784 } 7785 return errMsgIdDuplicate 7786 } 7787 // FIXME(dlc) - locking conflict with accessing mset.clseq 7788 // For now we stage with zero, and will update in processStreamMsg. 7789 mset.storeMsgIdLocked(&ddentry{msgId, 0, time.Now().UnixNano()}) 7790 mset.mu.Unlock() 7791 } 7792 } 7793 7794 // Proceed with proposing this message. 7795 7796 // We only use mset.clseq for clustering and in case we run ahead of actual commits. 7797 // Check if we need to set initial value here 7798 mset.clMu.Lock() 7799 if mset.clseq == 0 || mset.clseq < lseq+mset.clfs { 7800 // Re-capture 7801 lseq = mset.lastSeq() 7802 mset.clseq = lseq + mset.clfs 7803 } 7804 7805 // Check if we have an interest policy and discard new with max msgs or bytes. 7806 // We need to deny here otherwise it could succeed on some peers and not others 7807 // depending on consumer ack state. So we deny here, if we allow that means we know 7808 // it would succeed on every peer. 7809 if interestPolicy && discard == DiscardNew && (maxMsgs > 0 || maxBytes > 0) { 7810 // Track inflight. 7811 if mset.inflight == nil { 7812 mset.inflight = make(map[uint64]uint64) 7813 } 7814 if stype == FileStorage { 7815 mset.inflight[mset.clseq] = fileStoreMsgSize(subject, hdr, msg) 7816 } else { 7817 mset.inflight[mset.clseq] = memStoreMsgSize(subject, hdr, msg) 7818 } 7819 7820 var state StreamState 7821 mset.store.FastState(&state) 7822 7823 var err error 7824 if maxMsgs > 0 && state.Msgs+uint64(len(mset.inflight)) > uint64(maxMsgs) { 7825 err = ErrMaxMsgs 7826 } else if maxBytes > 0 { 7827 // TODO(dlc) - Could track this rollup independently. 7828 var bytesPending uint64 7829 for _, nb := range mset.inflight { 7830 bytesPending += nb 7831 } 7832 if state.Bytes+bytesPending > uint64(maxBytes) { 7833 err = ErrMaxBytes 7834 } 7835 } 7836 if err != nil { 7837 delete(mset.inflight, mset.clseq) 7838 mset.clMu.Unlock() 7839 if canRespond { 7840 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} 7841 resp.Error = NewJSStreamStoreFailedError(err, Unless(err)) 7842 response, _ = json.Marshal(resp) 7843 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) 7844 } 7845 return err 7846 } 7847 } 7848 7849 esm := encodeStreamMsgAllowCompress(subject, reply, hdr, msg, mset.clseq, time.Now().UnixNano(), mset.compressOK) 7850 var mtKey uint64 7851 if mt != nil { 7852 mtKey = mset.clseq 7853 if mset.mt == nil { 7854 mset.mt = make(map[uint64]*msgTrace) 7855 } 7856 mset.mt[mtKey] = mt 7857 } 7858 7859 // Do proposal. 7860 err := node.Propose(esm) 7861 if err == nil { 7862 mset.clseq++ 7863 } 7864 7865 // Check to see if we are being overrun. 7866 // TODO(dlc) - Make this a limit where we drop messages to protect ourselves, but allow to be configured. 7867 if mset.clseq-(lseq+mset.clfs) > streamLagWarnThreshold { 7868 lerr := fmt.Errorf("JetStream stream '%s > %s' has high message lag", jsa.acc().Name, name) 7869 s.RateLimitWarnf(lerr.Error()) 7870 } 7871 mset.clMu.Unlock() 7872 7873 if err != nil { 7874 if mt != nil { 7875 mset.getAndDeleteMsgTrace(mtKey) 7876 } 7877 if canRespond { 7878 var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: mset.cfg.Name}} 7879 resp.Error = &ApiError{Code: 503, Description: err.Error()} 7880 response, _ = json.Marshal(resp) 7881 // If we errored out respond here. 7882 outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) 7883 } 7884 if isOutOfSpaceErr(err) { 7885 s.handleOutOfSpace(mset) 7886 } 7887 } 7888 7889 return err 7890 } 7891 7892 func (mset *stream) getAndDeleteMsgTrace(lseq uint64) *msgTrace { 7893 if mset == nil { 7894 return nil 7895 } 7896 mset.clMu.Lock() 7897 mt, ok := mset.mt[lseq] 7898 if ok { 7899 delete(mset.mt, lseq) 7900 } 7901 mset.clMu.Unlock() 7902 return mt 7903 } 7904 7905 // For requesting messages post raft snapshot to catch up streams post server restart. 7906 // Any deleted msgs etc will be handled inline on catchup. 7907 type streamSyncRequest struct { 7908 Peer string `json:"peer,omitempty"` 7909 FirstSeq uint64 `json:"first_seq"` 7910 LastSeq uint64 `json:"last_seq"` 7911 DeleteRangesOk bool `json:"delete_ranges"` 7912 } 7913 7914 // Given a stream state that represents a snapshot, calculate the sync request based on our current state. 7915 // Stream lock must be held. 7916 func (mset *stream) calculateSyncRequest(state *StreamState, snap *StreamReplicatedState) *streamSyncRequest { 7917 // Shouldn't happen, but consequences are pretty bad if we have the lock held and 7918 // our caller tries to take the lock again on panic defer, as in processSnapshot. 7919 if state == nil || snap == nil || mset.node == nil { 7920 return nil 7921 } 7922 // Quick check if we are already caught up. 7923 if state.LastSeq >= snap.LastSeq { 7924 return nil 7925 } 7926 return &streamSyncRequest{FirstSeq: state.LastSeq + 1, LastSeq: snap.LastSeq, Peer: mset.node.ID(), DeleteRangesOk: true} 7927 } 7928 7929 // processSnapshotDeletes will update our current store based on the snapshot 7930 // but only processing deletes and new FirstSeq / purges. 7931 func (mset *stream) processSnapshotDeletes(snap *StreamReplicatedState) { 7932 mset.mu.Lock() 7933 var state StreamState 7934 mset.store.FastState(&state) 7935 // Always adjust if FirstSeq has moved beyond our state. 7936 var didReset bool 7937 if snap.FirstSeq > state.FirstSeq { 7938 mset.store.Compact(snap.FirstSeq) 7939 mset.store.FastState(&state) 7940 mset.lseq = state.LastSeq 7941 mset.clearAllPreAcksBelowFloor(state.FirstSeq) 7942 didReset = true 7943 } 7944 s := mset.srv 7945 mset.mu.Unlock() 7946 7947 if didReset { 7948 s.Warnf("Catchup for stream '%s > %s' resetting first sequence: %d on catchup request", 7949 mset.account(), mset.name(), snap.FirstSeq) 7950 } 7951 7952 if len(snap.Deleted) > 0 { 7953 mset.store.SyncDeleted(snap.Deleted) 7954 } 7955 } 7956 7957 func (mset *stream) setCatchupPeer(peer string, lag uint64) { 7958 if peer == _EMPTY_ { 7959 return 7960 } 7961 mset.mu.Lock() 7962 if mset.catchups == nil { 7963 mset.catchups = make(map[string]uint64) 7964 } 7965 mset.catchups[peer] = lag 7966 mset.mu.Unlock() 7967 } 7968 7969 // Will decrement by one. 7970 func (mset *stream) updateCatchupPeer(peer string) { 7971 if peer == _EMPTY_ { 7972 return 7973 } 7974 mset.mu.Lock() 7975 if lag := mset.catchups[peer]; lag > 0 { 7976 mset.catchups[peer] = lag - 1 7977 } 7978 mset.mu.Unlock() 7979 } 7980 7981 func (mset *stream) decrementCatchupPeer(peer string, num uint64) { 7982 if peer == _EMPTY_ { 7983 return 7984 } 7985 mset.mu.Lock() 7986 if lag := mset.catchups[peer]; lag > 0 { 7987 if lag >= num { 7988 lag -= num 7989 } else { 7990 lag = 0 7991 } 7992 mset.catchups[peer] = lag 7993 } 7994 mset.mu.Unlock() 7995 } 7996 7997 func (mset *stream) clearCatchupPeer(peer string) { 7998 mset.mu.Lock() 7999 if mset.catchups != nil { 8000 delete(mset.catchups, peer) 8001 } 8002 mset.mu.Unlock() 8003 } 8004 8005 // Lock should be held. 8006 func (mset *stream) clearAllCatchupPeers() { 8007 if mset.catchups != nil { 8008 mset.catchups = nil 8009 } 8010 } 8011 8012 func (mset *stream) lagForCatchupPeer(peer string) uint64 { 8013 mset.mu.RLock() 8014 defer mset.mu.RUnlock() 8015 if mset.catchups == nil { 8016 return 0 8017 } 8018 return mset.catchups[peer] 8019 } 8020 8021 func (mset *stream) hasCatchupPeers() bool { 8022 mset.mu.RLock() 8023 defer mset.mu.RUnlock() 8024 return len(mset.catchups) > 0 8025 } 8026 8027 func (mset *stream) setCatchingUp() { 8028 mset.catchup.Store(true) 8029 } 8030 8031 func (mset *stream) clearCatchingUp() { 8032 mset.catchup.Store(false) 8033 } 8034 8035 func (mset *stream) isCatchingUp() bool { 8036 return mset.catchup.Load() 8037 } 8038 8039 // Determine if a non-leader is current. 8040 // Lock should be held. 8041 func (mset *stream) isCurrent() bool { 8042 if mset.node == nil { 8043 return true 8044 } 8045 return mset.node.Current() && !mset.catchup.Load() 8046 } 8047 8048 // Maximum requests for the whole server that can be in flight at the same time. 8049 const maxConcurrentSyncRequests = 16 8050 8051 var ( 8052 errCatchupCorruptSnapshot = errors.New("corrupt stream snapshot detected") 8053 errCatchupStalled = errors.New("catchup stalled") 8054 errCatchupStreamStopped = errors.New("stream has been stopped") // when a catchup is terminated due to the stream going away. 8055 errCatchupBadMsg = errors.New("bad catchup msg") 8056 errCatchupWrongSeqForSkip = errors.New("wrong sequence for skipped msg") 8057 ) 8058 8059 // Process a stream snapshot. 8060 func (mset *stream) processSnapshot(snap *StreamReplicatedState) (e error) { 8061 // Update any deletes, etc. 8062 mset.processSnapshotDeletes(snap) 8063 mset.setCLFS(snap.Failed) 8064 8065 mset.mu.Lock() 8066 var state StreamState 8067 mset.store.FastState(&state) 8068 sreq := mset.calculateSyncRequest(&state, snap) 8069 8070 s, js, subject, n, st := mset.srv, mset.js, mset.sa.Sync, mset.node, mset.cfg.Storage 8071 qname := fmt.Sprintf("[ACC:%s] stream '%s' snapshot", mset.acc.Name, mset.cfg.Name) 8072 mset.mu.Unlock() 8073 8074 // Bug that would cause this to be empty on stream update. 8075 if subject == _EMPTY_ { 8076 return errCatchupCorruptSnapshot 8077 } 8078 8079 // Just return if up to date or already exceeded limits. 8080 if sreq == nil || js.limitsExceeded(st) { 8081 return nil 8082 } 8083 8084 // Pause the apply channel for our raft group while we catch up. 8085 if err := n.PauseApply(); err != nil { 8086 return err 8087 } 8088 8089 defer func() { 8090 // Don't bother resuming if server or stream is gone. 8091 if e != errCatchupStreamStopped && e != ErrServerNotRunning { 8092 n.ResumeApply() 8093 } 8094 }() 8095 8096 // Set our catchup state. 8097 mset.setCatchingUp() 8098 defer mset.clearCatchingUp() 8099 8100 var sub *subscription 8101 var err error 8102 8103 const activityInterval = 30 * time.Second 8104 notActive := time.NewTimer(activityInterval) 8105 defer notActive.Stop() 8106 8107 defer func() { 8108 if sub != nil { 8109 s.sysUnsubscribe(sub) 8110 } 8111 // Make sure any consumers are updated for the pending amounts. 8112 mset.mu.Lock() 8113 for _, o := range mset.consumers { 8114 o.mu.Lock() 8115 if o.isLeader() { 8116 o.streamNumPending() 8117 } 8118 o.mu.Unlock() 8119 } 8120 mset.mu.Unlock() 8121 }() 8122 8123 var releaseSem bool 8124 releaseSyncOutSem := func() { 8125 if !releaseSem { 8126 return 8127 } 8128 // Need to use select for the server shutdown case. 8129 select { 8130 case s.syncOutSem <- struct{}{}: 8131 default: 8132 } 8133 releaseSem = false 8134 } 8135 // On exit, we will release our semaphore if we acquired it. 8136 defer releaseSyncOutSem() 8137 8138 // Do not let this go on forever. 8139 const maxRetries = 3 8140 var numRetries int 8141 8142 RETRY: 8143 // On retry, we need to release the semaphore we got. Call will be no-op 8144 // if releaseSem boolean has not been set to true on successfully getting 8145 // the semaphore. 8146 releaseSyncOutSem() 8147 8148 if n.GroupLeader() == _EMPTY_ { 8149 return fmt.Errorf("catchup for stream '%s > %s' aborted, no leader", mset.account(), mset.name()) 8150 } 8151 8152 // If we have a sub clear that here. 8153 if sub != nil { 8154 s.sysUnsubscribe(sub) 8155 sub = nil 8156 } 8157 8158 if !s.isRunning() { 8159 return ErrServerNotRunning 8160 } 8161 8162 numRetries++ 8163 if numRetries >= maxRetries { 8164 // Force a hard reset here. 8165 return errFirstSequenceMismatch 8166 } 8167 8168 // Block here if we have too many requests in flight. 8169 <-s.syncOutSem 8170 releaseSem = true 8171 8172 // We may have been blocked for a bit, so the reset needs to ensure that we 8173 // consume the already fired timer. 8174 if !notActive.Stop() { 8175 select { 8176 case <-notActive.C: 8177 default: 8178 } 8179 } 8180 notActive.Reset(activityInterval) 8181 8182 // Grab sync request again on failures. 8183 if sreq == nil { 8184 mset.mu.RLock() 8185 var state StreamState 8186 mset.store.FastState(&state) 8187 sreq = mset.calculateSyncRequest(&state, snap) 8188 mset.mu.RUnlock() 8189 if sreq == nil { 8190 return nil 8191 } 8192 } 8193 8194 // Used to transfer message from the wire to another Go routine internally. 8195 type im struct { 8196 msg []byte 8197 reply string 8198 } 8199 // This is used to notify the leader that it should stop the runCatchup 8200 // because we are either bailing out or going to retry due to an error. 8201 notifyLeaderStopCatchup := func(mrec *im, err error) { 8202 if mrec.reply == _EMPTY_ { 8203 return 8204 } 8205 s.sendInternalMsgLocked(mrec.reply, _EMPTY_, nil, err.Error()) 8206 } 8207 8208 msgsQ := newIPQueue[*im](s, qname) 8209 defer msgsQ.unregister() 8210 8211 // Send our catchup request here. 8212 reply := syncReplySubject() 8213 sub, err = s.sysSubscribe(reply, func(_ *subscription, _ *client, _ *Account, _, reply string, msg []byte) { 8214 // Make copy since we are using a buffer from the inbound client/route. 8215 msgsQ.push(&im{copyBytes(msg), reply}) 8216 }) 8217 if err != nil { 8218 s.Errorf("Could not subscribe to stream catchup: %v", err) 8219 goto RETRY 8220 } 8221 8222 // Send our sync request. 8223 b, _ := json.Marshal(sreq) 8224 s.sendInternalMsgLocked(subject, reply, nil, b) 8225 // Remember when we sent this out to avoid loop spins on errors below. 8226 reqSendTime := time.Now() 8227 // Clear our sync request. 8228 sreq = nil 8229 8230 // Run our own select loop here. 8231 for qch, lch := n.QuitC(), n.LeadChangeC(); ; { 8232 select { 8233 case <-msgsQ.ch: 8234 notActive.Reset(activityInterval) 8235 8236 mrecs := msgsQ.pop() 8237 for _, mrec := range mrecs { 8238 msg := mrec.msg 8239 // Check for eof signaling. 8240 if len(msg) == 0 { 8241 msgsQ.recycle(&mrecs) 8242 mset.checkInterestState() 8243 return nil 8244 } 8245 if _, err := mset.processCatchupMsg(msg); err == nil { 8246 if mrec.reply != _EMPTY_ { 8247 s.sendInternalMsgLocked(mrec.reply, _EMPTY_, nil, nil) 8248 } 8249 } else if isOutOfSpaceErr(err) { 8250 notifyLeaderStopCatchup(mrec, err) 8251 return err 8252 } else if err == NewJSInsufficientResourcesError() { 8253 notifyLeaderStopCatchup(mrec, err) 8254 if mset.js.limitsExceeded(mset.cfg.Storage) { 8255 s.resourcesExceededError() 8256 } else { 8257 s.Warnf("Catchup for stream '%s > %s' errored, account resources exceeded: %v", mset.account(), mset.name(), err) 8258 } 8259 msgsQ.recycle(&mrecs) 8260 return err 8261 } else { 8262 notifyLeaderStopCatchup(mrec, err) 8263 s.Warnf("Catchup for stream '%s > %s' errored, will retry: %v", mset.account(), mset.name(), err) 8264 msgsQ.recycle(&mrecs) 8265 8266 // Make sure we do not spin and make things worse. 8267 const minRetryWait = 2 * time.Second 8268 elapsed := time.Since(reqSendTime) 8269 if elapsed < minRetryWait { 8270 select { 8271 case <-s.quitCh: 8272 return ErrServerNotRunning 8273 case <-qch: 8274 return errCatchupStreamStopped 8275 case <-time.After(minRetryWait - elapsed): 8276 } 8277 } 8278 goto RETRY 8279 } 8280 } 8281 notActive.Reset(activityInterval) 8282 msgsQ.recycle(&mrecs) 8283 case <-notActive.C: 8284 if mrecs := msgsQ.pop(); len(mrecs) > 0 { 8285 mrec := mrecs[0] 8286 notifyLeaderStopCatchup(mrec, errCatchupStalled) 8287 msgsQ.recycle(&mrecs) 8288 } 8289 s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name()) 8290 goto RETRY 8291 case <-s.quitCh: 8292 return ErrServerNotRunning 8293 case <-qch: 8294 return errCatchupStreamStopped 8295 case isLeader := <-lch: 8296 if isLeader { 8297 n.StepDown() 8298 goto RETRY 8299 } 8300 } 8301 } 8302 } 8303 8304 // processCatchupMsg will be called to process out of band catchup msgs from a sync request. 8305 func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) { 8306 if len(msg) == 0 { 8307 return 0, errCatchupBadMsg 8308 } 8309 op := entryOp(msg[0]) 8310 if op != streamMsgOp && op != compressedStreamMsgOp && op != deleteRangeOp { 8311 return 0, errCatchupBadMsg 8312 } 8313 8314 mbuf := msg[1:] 8315 if op == deleteRangeOp { 8316 dr, err := decodeDeleteRange(mbuf) 8317 if err != nil { 8318 return 0, errCatchupBadMsg 8319 } 8320 // Handle the delete range. 8321 // Make sure the sequences match up properly. 8322 mset.mu.Lock() 8323 if len(mset.preAcks) > 0 { 8324 for seq := dr.First; seq < dr.First+dr.Num; seq++ { 8325 mset.clearAllPreAcks(seq) 8326 } 8327 } 8328 if err = mset.store.SkipMsgs(dr.First, dr.Num); err != nil { 8329 mset.mu.Unlock() 8330 return 0, errCatchupWrongSeqForSkip 8331 } 8332 mset.lseq = dr.First + dr.Num - 1 8333 lseq := mset.lseq 8334 mset.mu.Unlock() 8335 return lseq, nil 8336 } 8337 8338 if op == compressedStreamMsgOp { 8339 var err error 8340 mbuf, err = s2.Decode(nil, mbuf) 8341 if err != nil { 8342 panic(err.Error()) 8343 } 8344 } 8345 8346 subj, _, hdr, msg, seq, ts, err := decodeStreamMsg(mbuf) 8347 if err != nil { 8348 return 0, errCatchupBadMsg 8349 } 8350 8351 mset.mu.Lock() 8352 st := mset.cfg.Storage 8353 ddloaded := mset.ddloaded 8354 tierName := mset.tier 8355 replicas := mset.cfg.Replicas 8356 8357 if mset.hasAllPreAcks(seq, subj) { 8358 mset.clearAllPreAcks(seq) 8359 // Mark this to be skipped 8360 subj, ts = _EMPTY_, 0 8361 } 8362 mset.mu.Unlock() 8363 8364 if mset.js.limitsExceeded(st) { 8365 return 0, NewJSInsufficientResourcesError() 8366 } else if exceeded, apiErr := mset.jsa.limitsExceeded(st, tierName, replicas); apiErr != nil { 8367 return 0, apiErr 8368 } else if exceeded { 8369 return 0, NewJSInsufficientResourcesError() 8370 } 8371 8372 // Put into our store 8373 // Messages to be skipped have no subject or timestamp. 8374 // TODO(dlc) - formalize with skipMsgOp 8375 if subj == _EMPTY_ && ts == 0 { 8376 if lseq := mset.store.SkipMsg(); lseq != seq { 8377 return 0, errCatchupWrongSeqForSkip 8378 } 8379 } else if err := mset.store.StoreRawMsg(subj, hdr, msg, seq, ts); err != nil { 8380 return 0, err 8381 } 8382 8383 // Update our lseq. 8384 mset.setLastSeq(seq) 8385 8386 // Check for MsgId and if we have one here make sure to update our internal map. 8387 if len(hdr) > 0 { 8388 if msgId := getMsgId(hdr); msgId != _EMPTY_ { 8389 if !ddloaded { 8390 mset.mu.Lock() 8391 mset.rebuildDedupe() 8392 mset.mu.Unlock() 8393 } 8394 mset.storeMsgId(&ddentry{msgId, seq, ts}) 8395 } 8396 } 8397 8398 return seq, nil 8399 } 8400 8401 func (mset *stream) handleClusterSyncRequest(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 8402 var sreq streamSyncRequest 8403 if err := json.Unmarshal(msg, &sreq); err != nil { 8404 // Log error. 8405 return 8406 } 8407 mset.srv.startGoRoutine(func() { mset.runCatchup(reply, &sreq) }) 8408 } 8409 8410 // Lock should be held. 8411 func (js *jetStream) offlineClusterInfo(rg *raftGroup) *ClusterInfo { 8412 s := js.srv 8413 8414 ci := &ClusterInfo{Name: s.ClusterName(), RaftGroup: rg.Name} 8415 for _, peer := range rg.Peers { 8416 if sir, ok := s.nodeToInfo.Load(peer); ok && sir != nil { 8417 si := sir.(nodeInfo) 8418 pi := &PeerInfo{Peer: peer, Name: si.name, Current: false, Offline: true} 8419 ci.Replicas = append(ci.Replicas, pi) 8420 } 8421 } 8422 return ci 8423 } 8424 8425 // clusterInfo will report on the status of the raft group. 8426 func (js *jetStream) clusterInfo(rg *raftGroup) *ClusterInfo { 8427 if js == nil { 8428 return nil 8429 } 8430 js.mu.RLock() 8431 defer js.mu.RUnlock() 8432 8433 s := js.srv 8434 if rg == nil || rg.node == nil { 8435 return &ClusterInfo{ 8436 Name: s.cachedClusterName(), 8437 Leader: s.Name(), 8438 } 8439 } 8440 8441 n := rg.node 8442 ci := &ClusterInfo{ 8443 Name: s.cachedClusterName(), 8444 Leader: s.serverNameForNode(n.GroupLeader()), 8445 RaftGroup: rg.Name, 8446 } 8447 8448 now := time.Now() 8449 id, peers := n.ID(), n.Peers() 8450 8451 // If we are leaderless, do not suppress putting us in the peer list. 8452 if ci.Leader == _EMPTY_ { 8453 id = _EMPTY_ 8454 } 8455 8456 for _, rp := range peers { 8457 if rp.ID != id && rg.isMember(rp.ID) { 8458 var lastSeen time.Duration 8459 if now.After(rp.Last) && rp.Last.Unix() != 0 { 8460 lastSeen = now.Sub(rp.Last) 8461 } 8462 current := rp.Current 8463 if current && lastSeen > lostQuorumInterval { 8464 current = false 8465 } 8466 // Create a peer info with common settings if the peer has not been seen 8467 // yet (which can happen after the whole cluster is stopped and only some 8468 // of the nodes are restarted). 8469 pi := &PeerInfo{ 8470 Current: current, 8471 Offline: true, 8472 Active: lastSeen, 8473 Lag: rp.Lag, 8474 Peer: rp.ID, 8475 } 8476 // If node is found, complete/update the settings. 8477 if sir, ok := s.nodeToInfo.Load(rp.ID); ok && sir != nil { 8478 si := sir.(nodeInfo) 8479 pi.Name, pi.Offline, pi.cluster = si.name, si.offline, si.cluster 8480 } else { 8481 // If not, then add a name that indicates that the server name 8482 // is unknown at this time, and clear the lag since it is misleading 8483 // (the node may not have that much lag). 8484 // Note: We return now the Peer ID in PeerInfo, so the "(peerID: %s)" 8485 // would technically not be required, but keeping it for now. 8486 pi.Name, pi.Lag = fmt.Sprintf("Server name unknown at this time (peerID: %s)", rp.ID), 0 8487 } 8488 ci.Replicas = append(ci.Replicas, pi) 8489 } 8490 } 8491 // Order the result based on the name so that we get something consistent 8492 // when doing repeated stream info in the CLI, etc... 8493 sort.Slice(ci.Replicas, func(i, j int) bool { 8494 return ci.Replicas[i].Name < ci.Replicas[j].Name 8495 }) 8496 return ci 8497 } 8498 8499 func (mset *stream) checkClusterInfo(ci *ClusterInfo) { 8500 for _, r := range ci.Replicas { 8501 peer := getHash(r.Name) 8502 if lag := mset.lagForCatchupPeer(peer); lag > 0 { 8503 r.Current = false 8504 r.Lag = lag 8505 } 8506 } 8507 } 8508 8509 // Return a list of alternates, ranked by preference order to the request, of stream mirrors. 8510 // This allows clients to select or get more information about read replicas that could be a 8511 // better option to connect to versus the original source. 8512 func (js *jetStream) streamAlternates(ci *ClientInfo, stream string) []StreamAlternate { 8513 if js == nil { 8514 return nil 8515 } 8516 8517 js.mu.RLock() 8518 defer js.mu.RUnlock() 8519 8520 s, cc := js.srv, js.cluster 8521 // Track our domain. 8522 domain := s.getOpts().JetStreamDomain 8523 8524 // No clustering just return nil. 8525 if cc == nil { 8526 return nil 8527 } 8528 acc, _ := s.LookupAccount(ci.serviceAccount()) 8529 if acc == nil { 8530 return nil 8531 } 8532 8533 // Collect our ordering first for clusters. 8534 weights := make(map[string]int) 8535 all := []string{ci.Cluster} 8536 all = append(all, ci.Alternates...) 8537 8538 for i := 0; i < len(all); i++ { 8539 weights[all[i]] = len(all) - i 8540 } 8541 8542 var alts []StreamAlternate 8543 for _, sa := range cc.streams[acc.Name] { 8544 // Add in ourselves and any mirrors. 8545 if sa.Config.Name == stream || (sa.Config.Mirror != nil && sa.Config.Mirror.Name == stream) { 8546 alts = append(alts, StreamAlternate{Name: sa.Config.Name, Domain: domain, Cluster: sa.Group.Cluster}) 8547 } 8548 } 8549 // If just us don't fill in. 8550 if len(alts) == 1 { 8551 return nil 8552 } 8553 8554 // Sort based on our weights that originate from the request itself. 8555 sort.Slice(alts, func(i, j int) bool { 8556 return weights[alts[i].Cluster] > weights[alts[j].Cluster] 8557 }) 8558 8559 return alts 8560 } 8561 8562 // Internal request for stream info, this is coming on the wire so do not block here. 8563 func (mset *stream) handleClusterStreamInfoRequest(_ *subscription, c *client, _ *Account, subject, reply string, _ []byte) { 8564 go mset.processClusterStreamInfoRequest(reply) 8565 } 8566 8567 func (mset *stream) processClusterStreamInfoRequest(reply string) { 8568 mset.mu.RLock() 8569 sysc, js, sa, config := mset.sysc, mset.srv.js.Load(), mset.sa, mset.cfg 8570 isLeader := mset.isLeader() 8571 mset.mu.RUnlock() 8572 8573 // By design all members will receive this. Normally we only want the leader answering. 8574 // But if we have stalled and lost quorom all can respond. 8575 if sa != nil && !js.isGroupLeaderless(sa.Group) && !isLeader { 8576 return 8577 } 8578 8579 // If we are not the leader let someone else possibly respond first. 8580 if !isLeader { 8581 time.Sleep(500 * time.Millisecond) 8582 } 8583 8584 si := &StreamInfo{ 8585 Created: mset.createdTime(), 8586 State: mset.state(), 8587 Config: config, 8588 Cluster: js.clusterInfo(mset.raftGroup()), 8589 Sources: mset.sourcesInfo(), 8590 Mirror: mset.mirrorInfo(), 8591 TimeStamp: time.Now().UTC(), 8592 } 8593 8594 // Check for out of band catchups. 8595 if mset.hasCatchupPeers() { 8596 mset.checkClusterInfo(si.Cluster) 8597 } 8598 8599 sysc.sendInternalMsg(reply, _EMPTY_, nil, si) 8600 } 8601 8602 // 64MB for now, for the total server. This is max we will blast out if asked to 8603 // do so to another server for purposes of catchups. 8604 // This number should be ok on 1Gbit interface. 8605 const defaultMaxTotalCatchupOutBytes = int64(64 * 1024 * 1024) 8606 8607 // Current total outstanding catchup bytes. 8608 func (s *Server) gcbTotal() int64 { 8609 s.gcbMu.RLock() 8610 defer s.gcbMu.RUnlock() 8611 return s.gcbOut 8612 } 8613 8614 // Returns true if Current total outstanding catchup bytes is below 8615 // the maximum configured. 8616 func (s *Server) gcbBelowMax() bool { 8617 s.gcbMu.RLock() 8618 defer s.gcbMu.RUnlock() 8619 return s.gcbOut <= s.gcbOutMax 8620 } 8621 8622 // Adds `sz` to the server's total outstanding catchup bytes and to `localsz` 8623 // under the gcbMu lock. The `localsz` points to the local outstanding catchup 8624 // bytes of the runCatchup go routine of a given stream. 8625 func (s *Server) gcbAdd(localsz *int64, sz int64) { 8626 s.gcbMu.Lock() 8627 atomic.AddInt64(localsz, sz) 8628 s.gcbOut += sz 8629 if s.gcbOut >= s.gcbOutMax && s.gcbKick == nil { 8630 s.gcbKick = make(chan struct{}) 8631 } 8632 s.gcbMu.Unlock() 8633 } 8634 8635 // Removes `sz` from the server's total outstanding catchup bytes and from 8636 // `localsz`, but only if `localsz` is non 0, which would signal that gcSubLast 8637 // has already been invoked. See that function for details. 8638 // Must be invoked under the gcbMu lock. 8639 func (s *Server) gcbSubLocked(localsz *int64, sz int64) { 8640 if atomic.LoadInt64(localsz) == 0 { 8641 return 8642 } 8643 atomic.AddInt64(localsz, -sz) 8644 s.gcbOut -= sz 8645 if s.gcbKick != nil && s.gcbOut < s.gcbOutMax { 8646 close(s.gcbKick) 8647 s.gcbKick = nil 8648 } 8649 } 8650 8651 // Locked version of gcbSubLocked() 8652 func (s *Server) gcbSub(localsz *int64, sz int64) { 8653 s.gcbMu.Lock() 8654 s.gcbSubLocked(localsz, sz) 8655 s.gcbMu.Unlock() 8656 } 8657 8658 // Similar to gcbSub() but reset `localsz` to 0 at the end under the gcbMu lock. 8659 // This will signal further calls to gcbSub() for this `localsz` pointer that 8660 // nothing should be done because runCatchup() has exited and any remaining 8661 // outstanding bytes value has already been decremented. 8662 func (s *Server) gcbSubLast(localsz *int64) { 8663 s.gcbMu.Lock() 8664 s.gcbSubLocked(localsz, *localsz) 8665 *localsz = 0 8666 s.gcbMu.Unlock() 8667 } 8668 8669 // Returns our kick chan, or nil if it does not exist. 8670 func (s *Server) cbKickChan() <-chan struct{} { 8671 s.gcbMu.RLock() 8672 defer s.gcbMu.RUnlock() 8673 return s.gcbKick 8674 } 8675 8676 func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) { 8677 s := mset.srv 8678 defer s.grWG.Done() 8679 8680 const maxOutBytes = int64(64 * 1024 * 1024) // 64MB for now, these are all internal, from server to server 8681 const maxOutMsgs = int32(256 * 1024) // 256k in case we have lots of small messages or skip msgs. 8682 outb := int64(0) 8683 outm := int32(0) 8684 8685 // On abnormal exit make sure to update global total. 8686 defer s.gcbSubLast(&outb) 8687 8688 // Flow control processing. 8689 ackReplySize := func(subj string) int64 { 8690 if li := strings.LastIndexByte(subj, btsep); li > 0 && li < len(subj) { 8691 return parseAckReplyNum(subj[li+1:]) 8692 } 8693 return 0 8694 } 8695 8696 nextBatchC := make(chan struct{}, 1) 8697 nextBatchC <- struct{}{} 8698 remoteQuitCh := make(chan struct{}) 8699 8700 const activityInterval = 30 * time.Second 8701 notActive := time.NewTimer(activityInterval) 8702 defer notActive.Stop() 8703 8704 // Setup ackReply for flow control. 8705 ackReply := syncAckSubject() 8706 ackSub, _ := s.sysSubscribe(ackReply, func(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 8707 if len(msg) > 0 { 8708 s.Warnf("Catchup for stream '%s > %s' was aborted on the remote due to: %q", 8709 mset.account(), mset.name(), msg) 8710 s.sysUnsubscribe(sub) 8711 close(remoteQuitCh) 8712 return 8713 } 8714 sz := ackReplySize(subject) 8715 s.gcbSub(&outb, sz) 8716 atomic.AddInt32(&outm, -1) 8717 mset.updateCatchupPeer(sreq.Peer) 8718 // Kick ourselves and anyone else who might have stalled on global state. 8719 select { 8720 case nextBatchC <- struct{}{}: 8721 // Reset our activity 8722 notActive.Reset(activityInterval) 8723 default: 8724 } 8725 }) 8726 defer s.sysUnsubscribe(ackSub) 8727 ackReplyT := strings.ReplaceAll(ackReply, ".*", ".%d") 8728 8729 // Grab our state. 8730 var state StreamState 8731 mset.mu.RLock() 8732 mset.store.FastState(&state) 8733 mset.mu.RUnlock() 8734 8735 // Reset notion of first if this request wants sequences before our starting sequence 8736 // and we would have nothing to send. If we have partial messages still need to send skips for those. 8737 // We will keep sreq's first sequence to not create sequence mismatches on the follower, but we extend the last to our current state. 8738 if sreq.FirstSeq < state.FirstSeq && state.FirstSeq > sreq.LastSeq { 8739 s.Debugf("Catchup for stream '%s > %s' resetting request first sequence from %d to %d", 8740 mset.account(), mset.name(), sreq.FirstSeq, state.FirstSeq) 8741 if state.LastSeq > sreq.LastSeq { 8742 sreq.LastSeq = state.LastSeq 8743 } 8744 } 8745 8746 // Setup sequences to walk through. 8747 seq, last := sreq.FirstSeq, sreq.LastSeq 8748 mset.setCatchupPeer(sreq.Peer, last-seq) 8749 8750 // Check if we can compress during this. 8751 compressOk := mset.compressAllowed() 8752 8753 var spb int 8754 const minWait = 5 * time.Second 8755 8756 sendNextBatchAndContinue := func(qch chan struct{}) bool { 8757 // Check if we know we will not enter the loop because we are done. 8758 if seq > last { 8759 s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name()) 8760 // EOF 8761 s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) 8762 return false 8763 } 8764 8765 // If we already sent a batch, we will try to make sure we can at least send a minimum 8766 // batch before sending the next batch. 8767 if spb > 0 { 8768 // Wait til we can send at least 4k 8769 const minBatchWait = int32(4 * 1024) 8770 mw := time.NewTimer(minWait) 8771 for done := false; !done; { 8772 select { 8773 case <-nextBatchC: 8774 done = maxOutMsgs-atomic.LoadInt32(&outm) > minBatchWait 8775 if !done { 8776 // Wait for a small bit. 8777 time.Sleep(50 * time.Millisecond) 8778 } else { 8779 // GC friendly. 8780 mw.Stop() 8781 } 8782 case <-mw.C: 8783 done = true 8784 case <-s.quitCh: 8785 return false 8786 case <-qch: 8787 return false 8788 case <-remoteQuitCh: 8789 return false 8790 } 8791 } 8792 spb = 0 8793 } 8794 8795 // Send an encoded msg. 8796 sendEM := func(em []byte) { 8797 // Place size in reply subject for flow control. 8798 l := int64(len(em)) 8799 reply := fmt.Sprintf(ackReplyT, l) 8800 s.gcbAdd(&outb, l) 8801 atomic.AddInt32(&outm, 1) 8802 s.sendInternalMsgLocked(sendSubject, reply, nil, em) 8803 spb++ 8804 } 8805 8806 // If we support gap markers. 8807 var dr DeleteRange 8808 drOk := sreq.DeleteRangesOk 8809 8810 // Will send our delete range. 8811 // Should already be checked for being valid. 8812 sendDR := func() { 8813 if dr.Num == 1 { 8814 // Send like a normal skip msg. 8815 sendEM(encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, dr.First, 0)) 8816 } else { 8817 // We have a run, send a gap record. We send these without reply or tracking. 8818 s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, encodeDeleteRange(&dr)) 8819 // Clear out the pending for catchup. 8820 mset.decrementCatchupPeer(sreq.Peer, dr.Num) 8821 } 8822 // Reset always. 8823 dr.First, dr.Num = 0, 0 8824 } 8825 8826 var smv StoreMsg 8827 for ; seq <= last && atomic.LoadInt64(&outb) <= maxOutBytes && atomic.LoadInt32(&outm) <= maxOutMsgs && s.gcbBelowMax(); seq++ { 8828 sm, err := mset.store.LoadMsg(seq, &smv) 8829 // if this is not a deleted msg, bail out. 8830 if err != nil && err != ErrStoreMsgNotFound && err != errDeletedMsg { 8831 if err == ErrStoreEOF { 8832 var state StreamState 8833 mset.store.FastState(&state) 8834 if seq > state.LastSeq { 8835 // The snapshot has a larger last sequence then we have. This could be due to a truncation 8836 // when trying to recover after corruption, still not 100% sure. Could be off by 1 too somehow, 8837 // but tested a ton of those with no success. 8838 s.Warnf("Catchup for stream '%s > %s' completed, but requested sequence %d was larger then current state: %+v", 8839 mset.account(), mset.name(), seq, state) 8840 // Try our best to redo our invalidated snapshot as well. 8841 if n := mset.raftNode(); n != nil { 8842 n.InstallSnapshot(mset.stateSnapshot()) 8843 } 8844 // Signal EOF 8845 s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) 8846 return false 8847 } 8848 } 8849 s.Warnf("Error loading message for catchup '%s > %s': %v", mset.account(), mset.name(), err) 8850 return false 8851 } 8852 8853 if sm != nil { 8854 // If we allow gap markers check if we have one pending. 8855 if drOk && dr.First > 0 { 8856 sendDR() 8857 } 8858 // Send the normal message now. 8859 sendEM(encodeStreamMsgAllowCompress(sm.subj, _EMPTY_, sm.hdr, sm.msg, sm.seq, sm.ts, compressOk)) 8860 } else { 8861 if drOk { 8862 if dr.First == 0 { 8863 dr.First, dr.Num = seq, 1 8864 } else { 8865 dr.Num++ 8866 } 8867 } else { 8868 // Skip record for deleted msg. 8869 sendEM(encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq, 0)) 8870 } 8871 } 8872 8873 // Check if we are done. 8874 if seq == last { 8875 // Need to see if we have a pending delete range. 8876 if drOk && dr.First > 0 { 8877 sendDR() 8878 } 8879 // Check for a condition where our state's first is now past the last that we could have sent. 8880 // If so reset last and continue sending. 8881 var state StreamState 8882 mset.mu.RLock() 8883 mset.store.FastState(&state) 8884 mset.mu.RUnlock() 8885 if last < state.FirstSeq { 8886 last = state.LastSeq 8887 } 8888 // Recheck our exit condition. 8889 if seq == last { 8890 s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name()) 8891 // EOF 8892 s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil) 8893 return false 8894 } 8895 } 8896 select { 8897 case <-remoteQuitCh: 8898 return false 8899 default: 8900 } 8901 } 8902 if drOk && dr.First > 0 { 8903 sendDR() 8904 } 8905 8906 return true 8907 } 8908 8909 // Check is this stream got closed. 8910 mset.mu.RLock() 8911 qch := mset.qch 8912 mset.mu.RUnlock() 8913 if qch == nil { 8914 return 8915 } 8916 8917 // Run as long as we are still active and need catchup. 8918 // FIXME(dlc) - Purge event? Stream delete? 8919 for { 8920 // Get this each time, will be non-nil if globally blocked and we will close to wake everyone up. 8921 cbKick := s.cbKickChan() 8922 8923 select { 8924 case <-s.quitCh: 8925 return 8926 case <-qch: 8927 return 8928 case <-remoteQuitCh: 8929 mset.clearCatchupPeer(sreq.Peer) 8930 return 8931 case <-notActive.C: 8932 s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name()) 8933 mset.clearCatchupPeer(sreq.Peer) 8934 return 8935 case <-nextBatchC: 8936 if !sendNextBatchAndContinue(qch) { 8937 mset.clearCatchupPeer(sreq.Peer) 8938 return 8939 } 8940 case <-cbKick: 8941 if !sendNextBatchAndContinue(qch) { 8942 mset.clearCatchupPeer(sreq.Peer) 8943 return 8944 } 8945 } 8946 } 8947 } 8948 8949 const jscAllSubj = "$JSC.>" 8950 8951 func syncSubjForStream() string { 8952 return syncSubject("$JSC.SYNC") 8953 } 8954 8955 func syncReplySubject() string { 8956 return syncSubject("$JSC.R") 8957 } 8958 8959 func infoReplySubject() string { 8960 return syncSubject("$JSC.R") 8961 } 8962 8963 func syncAckSubject() string { 8964 return syncSubject("$JSC.ACK") + ".*" 8965 } 8966 8967 func syncSubject(pre string) string { 8968 var sb strings.Builder 8969 sb.WriteString(pre) 8970 sb.WriteByte(btsep) 8971 8972 var b [replySuffixLen]byte 8973 rn := rand.Int63() 8974 for i, l := 0, rn; i < len(b); i++ { 8975 b[i] = digits[l%base] 8976 l /= base 8977 } 8978 8979 sb.Write(b[:]) 8980 return sb.String() 8981 } 8982 8983 const ( 8984 clusterStreamInfoT = "$JSC.SI.%s.%s" 8985 clusterConsumerInfoT = "$JSC.CI.%s.%s.%s" 8986 jsaUpdatesSubT = "$JSC.ARU.%s.*" 8987 jsaUpdatesPubT = "$JSC.ARU.%s.%s" 8988 )