get.pme.sh/pnats@v0.0.0-20240304004023-26bb5a137ed0/server/jetstream_cluster.go (about)

     1  // Copyright 2020-2024 The NATS Authors
     2  // Licensed under the Apache License, Version 2.0 (the "License");
     3  // you may not use this file except in compliance with the License.
     4  // You may obtain a copy of the License at
     5  //
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package server
    15  
    16  import (
    17  	"bytes"
    18  	crand "crypto/rand"
    19  	"encoding/binary"
    20  	"encoding/json"
    21  	"errors"
    22  	"fmt"
    23  	"math"
    24  	"math/rand"
    25  	"os"
    26  	"path/filepath"
    27  	"reflect"
    28  	"sort"
    29  	"strings"
    30  	"sync/atomic"
    31  	"time"
    32  
    33  	"github.com/klauspost/compress/s2"
    34  	"github.com/minio/highwayhash"
    35  	"github.com/nats-io/nuid"
    36  )
    37  
    38  // jetStreamCluster holds information about the meta group and stream assignments.
    39  type jetStreamCluster struct {
    40  	// The metacontroller raftNode.
    41  	meta RaftNode
    42  	// For stream and consumer assignments. All servers will have this be the same.
    43  	// ACCOUNT -> STREAM -> Stream Assignment -> Consumers
    44  	streams map[string]map[string]*streamAssignment
    45  	// These are inflight proposals and used to apply limits when there are
    46  	// concurrent requests that would otherwise be accepted.
    47  	// We also record the group for the stream. This is needed since if we have
    48  	// concurrent requests for same account and stream we need to let it process to get
    49  	// a response but they need to be same group, peers etc.
    50  	inflight map[string]map[string]*raftGroup
    51  	// Signals meta-leader should check the stream assignments.
    52  	streamsCheck bool
    53  	// Server.
    54  	s *Server
    55  	// Internal client.
    56  	c *client
    57  	// Processing assignment results.
    58  	streamResults   *subscription
    59  	consumerResults *subscription
    60  	// System level request to have the leader stepdown.
    61  	stepdown *subscription
    62  	// System level requests to remove a peer.
    63  	peerRemove *subscription
    64  	// System level request to move a stream
    65  	peerStreamMove *subscription
    66  	// System level request to cancel a stream move
    67  	peerStreamCancelMove *subscription
    68  	// To pop out the monitorCluster before the raft layer.
    69  	qch chan struct{}
    70  }
    71  
    72  // Used to guide placement of streams and meta controllers in clustered JetStream.
    73  type Placement struct {
    74  	Cluster string   `json:"cluster,omitempty"`
    75  	Tags    []string `json:"tags,omitempty"`
    76  }
    77  
    78  // Define types of the entry.
    79  type entryOp uint8
    80  
    81  // ONLY ADD TO THE END, DO NOT INSERT IN BETWEEN WILL BREAK SERVER INTEROP.
    82  const (
    83  	// Meta ops.
    84  	assignStreamOp entryOp = iota
    85  	assignConsumerOp
    86  	removeStreamOp
    87  	removeConsumerOp
    88  	// Stream ops.
    89  	streamMsgOp
    90  	purgeStreamOp
    91  	deleteMsgOp
    92  	// Consumer ops.
    93  	updateDeliveredOp
    94  	updateAcksOp
    95  	// Compressed consumer assignments.
    96  	assignCompressedConsumerOp
    97  	// Filtered Consumer skip.
    98  	updateSkipOp
    99  	// Update Stream.
   100  	updateStreamOp
   101  	// For updating information on pending pull requests.
   102  	addPendingRequest
   103  	removePendingRequest
   104  	// For sending compressed streams, either through RAFT or catchup.
   105  	compressedStreamMsgOp
   106  	// For sending deleted gaps on catchups for replicas.
   107  	deleteRangeOp
   108  )
   109  
   110  // raftGroups are controlled by the metagroup controller.
   111  // The raftGroups will house streams and consumers.
   112  type raftGroup struct {
   113  	Name      string      `json:"name"`
   114  	Peers     []string    `json:"peers"`
   115  	Storage   StorageType `json:"store"`
   116  	Cluster   string      `json:"cluster,omitempty"`
   117  	Preferred string      `json:"preferred,omitempty"`
   118  	// Internal
   119  	node RaftNode
   120  }
   121  
   122  // streamAssignment is what the meta controller uses to assign streams to peers.
   123  type streamAssignment struct {
   124  	Client  *ClientInfo   `json:"client,omitempty"`
   125  	Created time.Time     `json:"created"`
   126  	Config  *StreamConfig `json:"stream"`
   127  	Group   *raftGroup    `json:"group"`
   128  	Sync    string        `json:"sync"`
   129  	Subject string        `json:"subject"`
   130  	Reply   string        `json:"reply"`
   131  	Restore *StreamState  `json:"restore_state,omitempty"`
   132  	// Internal
   133  	consumers  map[string]*consumerAssignment
   134  	responded  bool
   135  	recovering bool
   136  	err        error
   137  }
   138  
   139  // consumerAssignment is what the meta controller uses to assign consumers to streams.
   140  type consumerAssignment struct {
   141  	Client  *ClientInfo     `json:"client,omitempty"`
   142  	Created time.Time       `json:"created"`
   143  	Name    string          `json:"name"`
   144  	Stream  string          `json:"stream"`
   145  	Config  *ConsumerConfig `json:"consumer"`
   146  	Group   *raftGroup      `json:"group"`
   147  	Subject string          `json:"subject"`
   148  	Reply   string          `json:"reply"`
   149  	State   *ConsumerState  `json:"state,omitempty"`
   150  	// Internal
   151  	responded  bool
   152  	recovering bool
   153  	deleted    bool
   154  	err        error
   155  }
   156  
   157  // streamPurge is what the stream leader will replicate when purging a stream.
   158  type streamPurge struct {
   159  	Client  *ClientInfo              `json:"client,omitempty"`
   160  	Stream  string                   `json:"stream"`
   161  	LastSeq uint64                   `json:"last_seq"`
   162  	Subject string                   `json:"subject"`
   163  	Reply   string                   `json:"reply"`
   164  	Request *JSApiStreamPurgeRequest `json:"request,omitempty"`
   165  }
   166  
   167  // streamMsgDelete is what the stream leader will replicate when deleting a message.
   168  type streamMsgDelete struct {
   169  	Client  *ClientInfo `json:"client,omitempty"`
   170  	Stream  string      `json:"stream"`
   171  	Seq     uint64      `json:"seq"`
   172  	NoErase bool        `json:"no_erase,omitempty"`
   173  	Subject string      `json:"subject"`
   174  	Reply   string      `json:"reply"`
   175  }
   176  
   177  const (
   178  	defaultStoreDirName  = "_js_"
   179  	defaultMetaGroupName = "_meta_"
   180  	defaultMetaFSBlkSize = 1024 * 1024
   181  	jsExcludePlacement   = "!jetstream"
   182  )
   183  
   184  // Returns information useful in mixed mode.
   185  func (s *Server) trackedJetStreamServers() (js, total int) {
   186  	s.mu.RLock()
   187  	defer s.mu.RUnlock()
   188  	if !s.isRunning() || !s.eventsEnabled() {
   189  		return -1, -1
   190  	}
   191  	s.nodeToInfo.Range(func(k, v interface{}) bool {
   192  		si := v.(nodeInfo)
   193  		if si.js {
   194  			js++
   195  		}
   196  		total++
   197  		return true
   198  	})
   199  	return js, total
   200  }
   201  
   202  func (s *Server) getJetStreamCluster() (*jetStream, *jetStreamCluster) {
   203  	if s.isShuttingDown() {
   204  		return nil, nil
   205  	}
   206  
   207  	js := s.getJetStream()
   208  	if js == nil {
   209  		return nil, nil
   210  	}
   211  
   212  	// Only set once, do not need a lock.
   213  	return js, js.cluster
   214  }
   215  
   216  func (s *Server) JetStreamIsClustered() bool {
   217  	js := s.getJetStream()
   218  	if js == nil {
   219  		return false
   220  	}
   221  	return js.isClustered()
   222  }
   223  
   224  func (s *Server) JetStreamIsLeader() bool {
   225  	return s.isMetaLeader.Load()
   226  }
   227  
   228  func (s *Server) JetStreamIsCurrent() bool {
   229  	js := s.getJetStream()
   230  	if js == nil {
   231  		return false
   232  	}
   233  	// Grab what we need and release js lock.
   234  	js.mu.RLock()
   235  	var meta RaftNode
   236  	cc := js.cluster
   237  	if cc != nil {
   238  		meta = cc.meta
   239  	}
   240  	js.mu.RUnlock()
   241  
   242  	if cc == nil {
   243  		// Non-clustered mode
   244  		return true
   245  	}
   246  	return meta.Current()
   247  }
   248  
   249  func (s *Server) JetStreamSnapshotMeta() error {
   250  	js := s.getJetStream()
   251  	if js == nil {
   252  		return NewJSNotEnabledError()
   253  	}
   254  	js.mu.RLock()
   255  	cc := js.cluster
   256  	isLeader := cc.isLeader()
   257  	meta := cc.meta
   258  	js.mu.RUnlock()
   259  
   260  	if !isLeader {
   261  		return errNotLeader
   262  	}
   263  
   264  	return meta.InstallSnapshot(js.metaSnapshot())
   265  }
   266  
   267  func (s *Server) JetStreamStepdownStream(account, stream string) error {
   268  	js, cc := s.getJetStreamCluster()
   269  	if js == nil {
   270  		return NewJSNotEnabledError()
   271  	}
   272  	if cc == nil {
   273  		return NewJSClusterNotActiveError()
   274  	}
   275  	// Grab account
   276  	acc, err := s.LookupAccount(account)
   277  	if err != nil {
   278  		return err
   279  	}
   280  	// Grab stream
   281  	mset, err := acc.lookupStream(stream)
   282  	if err != nil {
   283  		return err
   284  	}
   285  
   286  	if node := mset.raftNode(); node != nil && node.Leader() {
   287  		node.StepDown()
   288  	}
   289  
   290  	return nil
   291  }
   292  
   293  func (s *Server) JetStreamStepdownConsumer(account, stream, consumer string) error {
   294  	js, cc := s.getJetStreamCluster()
   295  	if js == nil {
   296  		return NewJSNotEnabledError()
   297  	}
   298  	if cc == nil {
   299  		return NewJSClusterNotActiveError()
   300  	}
   301  	// Grab account
   302  	acc, err := s.LookupAccount(account)
   303  	if err != nil {
   304  		return err
   305  	}
   306  	// Grab stream
   307  	mset, err := acc.lookupStream(stream)
   308  	if err != nil {
   309  		return err
   310  	}
   311  
   312  	o := mset.lookupConsumer(consumer)
   313  	if o == nil {
   314  		return NewJSConsumerNotFoundError()
   315  	}
   316  
   317  	if node := o.raftNode(); node != nil && node.Leader() {
   318  		node.StepDown()
   319  	}
   320  
   321  	return nil
   322  }
   323  
   324  func (s *Server) JetStreamSnapshotStream(account, stream string) error {
   325  	js, cc := s.getJetStreamCluster()
   326  	if js == nil {
   327  		return NewJSNotEnabledForAccountError()
   328  	}
   329  	if cc == nil {
   330  		return NewJSClusterNotActiveError()
   331  	}
   332  	// Grab account
   333  	acc, err := s.LookupAccount(account)
   334  	if err != nil {
   335  		return err
   336  	}
   337  	// Grab stream
   338  	mset, err := acc.lookupStream(stream)
   339  	if err != nil {
   340  		return err
   341  	}
   342  
   343  	// Hold lock when installing snapshot.
   344  	mset.mu.Lock()
   345  	if mset.node == nil {
   346  		mset.mu.Unlock()
   347  		return nil
   348  	}
   349  	err = mset.node.InstallSnapshot(mset.stateSnapshotLocked())
   350  	mset.mu.Unlock()
   351  
   352  	return err
   353  }
   354  
   355  func (s *Server) JetStreamClusterPeers() []string {
   356  	js := s.getJetStream()
   357  	if js == nil {
   358  		return nil
   359  	}
   360  	js.mu.RLock()
   361  	defer js.mu.RUnlock()
   362  
   363  	cc := js.cluster
   364  	if !cc.isLeader() || cc.meta == nil {
   365  		return nil
   366  	}
   367  	peers := cc.meta.Peers()
   368  	var nodes []string
   369  	for _, p := range peers {
   370  		si, ok := s.nodeToInfo.Load(p.ID)
   371  		if !ok || si == nil {
   372  			continue
   373  		}
   374  		ni := si.(nodeInfo)
   375  		// Ignore if offline, no JS, or no current stats have been received.
   376  		if ni.offline || !ni.js || ni.stats == nil {
   377  			continue
   378  		}
   379  		nodes = append(nodes, si.(nodeInfo).name)
   380  	}
   381  	return nodes
   382  }
   383  
   384  // Read lock should be held.
   385  func (cc *jetStreamCluster) isLeader() bool {
   386  	if cc == nil {
   387  		// Non-clustered mode
   388  		return true
   389  	}
   390  	return cc.meta != nil && cc.meta.Leader()
   391  }
   392  
   393  // isStreamCurrent will determine if the stream is up to date.
   394  // For R1 it will make sure the stream is present on this server.
   395  // Read lock should be held.
   396  func (cc *jetStreamCluster) isStreamCurrent(account, stream string) bool {
   397  	if cc == nil {
   398  		// Non-clustered mode
   399  		return true
   400  	}
   401  	as := cc.streams[account]
   402  	if as == nil {
   403  		return false
   404  	}
   405  	sa := as[stream]
   406  	if sa == nil {
   407  		return false
   408  	}
   409  	rg := sa.Group
   410  	if rg == nil {
   411  		return false
   412  	}
   413  
   414  	if rg.node == nil || rg.node.Current() {
   415  		// Check if we are processing a snapshot and are catching up.
   416  		acc, err := cc.s.LookupAccount(account)
   417  		if err != nil {
   418  			return false
   419  		}
   420  		mset, err := acc.lookupStream(stream)
   421  		if err != nil {
   422  			return false
   423  		}
   424  		if mset.isCatchingUp() {
   425  			return false
   426  		}
   427  		// Success.
   428  		return true
   429  	}
   430  
   431  	return false
   432  }
   433  
   434  // Restart the stream in question.
   435  // Should only be called when the stream is known to be in a bad state.
   436  func (js *jetStream) restartStream(acc *Account, csa *streamAssignment) {
   437  	js.mu.Lock()
   438  	s, cc := js.srv, js.cluster
   439  	if cc == nil {
   440  		js.mu.Unlock()
   441  		return
   442  	}
   443  	// Need to lookup the one directly from the meta layer, what we get handed is a copy if coming from isStreamHealthy.
   444  	asa := cc.streams[acc.Name]
   445  	if asa == nil {
   446  		js.mu.Unlock()
   447  		return
   448  	}
   449  	sa := asa[csa.Config.Name]
   450  	if sa == nil {
   451  		js.mu.Unlock()
   452  		return
   453  	}
   454  	// Make sure to clear out the raft node if still present in the meta layer.
   455  	if rg := sa.Group; rg != nil && rg.node != nil {
   456  		if rg.node.State() != Closed {
   457  			rg.node.Stop()
   458  		}
   459  		rg.node = nil
   460  	}
   461  	sinceCreation := time.Since(sa.Created)
   462  	js.mu.Unlock()
   463  
   464  	// Process stream assignment to recreate.
   465  	// Check that we have given system enough time to start us up.
   466  	// This will be longer than obvious, and matches consumer logic in case system very busy.
   467  	if sinceCreation < 10*time.Second {
   468  		s.Debugf("Not restarting missing stream '%s > %s', too soon since creation %v",
   469  			acc, csa.Config.Name, sinceCreation)
   470  		return
   471  	}
   472  
   473  	js.processStreamAssignment(sa)
   474  
   475  	// If we had consumers assigned to this server they will be present in the copy, csa.
   476  	// They also need to be processed. The csa consumers is a copy of only our consumers,
   477  	// those assigned to us, but the consumer assignment's there are direct from the meta
   478  	// layer to make this part much easier and avoid excessive lookups.
   479  	for _, cca := range csa.consumers {
   480  		if cca.deleted {
   481  			continue
   482  		}
   483  		// Need to look up original as well here to make sure node is nil.
   484  		js.mu.Lock()
   485  		ca := sa.consumers[cca.Name]
   486  		if ca != nil && ca.Group != nil {
   487  			// Make sure the node is stopped if still running.
   488  			if node := ca.Group.node; node != nil && node.State() != Closed {
   489  				node.Stop()
   490  			}
   491  			// Make sure node is wiped.
   492  			ca.Group.node = nil
   493  		}
   494  		js.mu.Unlock()
   495  		if ca != nil {
   496  			js.processConsumerAssignment(ca)
   497  		}
   498  	}
   499  }
   500  
   501  // isStreamHealthy will determine if the stream is up to date or very close.
   502  // For R1 it will make sure the stream is present on this server.
   503  func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool {
   504  	js.mu.RLock()
   505  	s, cc := js.srv, js.cluster
   506  	if cc == nil {
   507  		// Non-clustered mode
   508  		js.mu.RUnlock()
   509  		return true
   510  	}
   511  
   512  	// Pull the group out.
   513  	rg := sa.Group
   514  	if rg == nil {
   515  		js.mu.RUnlock()
   516  		return false
   517  	}
   518  
   519  	streamName := sa.Config.Name
   520  	node := rg.node
   521  	js.mu.RUnlock()
   522  
   523  	// First lookup stream and make sure its there.
   524  	mset, err := acc.lookupStream(streamName)
   525  	if err != nil {
   526  		js.restartStream(acc, sa)
   527  		return false
   528  	}
   529  
   530  	// If we are catching up return false.
   531  	if mset.isCatchingUp() {
   532  		return false
   533  	}
   534  
   535  	if node == nil || node.Healthy() {
   536  		// Check if we are processing a snapshot and are catching up.
   537  		if !mset.isCatchingUp() {
   538  			return true
   539  		}
   540  	} else if node != nil {
   541  		if node != mset.raftNode() {
   542  			s.Warnf("Detected stream cluster node skew '%s > %s'", acc.GetName(), streamName)
   543  			node.Delete()
   544  			mset.resetClusteredState(nil)
   545  		} else if node.State() == Closed {
   546  			js.restartStream(acc, sa)
   547  		}
   548  	}
   549  
   550  	return false
   551  }
   552  
   553  // isConsumerCurrent will determine if the consumer is up to date.
   554  // For R1 it will make sure the consunmer is present on this server.
   555  func (js *jetStream) isConsumerHealthy(mset *stream, consumer string, ca *consumerAssignment) bool {
   556  	if mset == nil {
   557  		return false
   558  	}
   559  
   560  	js.mu.RLock()
   561  	cc := js.cluster
   562  	if cc == nil {
   563  		// Non-clustered mode
   564  		js.mu.RUnlock()
   565  		return true
   566  	}
   567  	// These are required.
   568  	if ca == nil || ca.Group == nil {
   569  		js.mu.RUnlock()
   570  		return false
   571  	}
   572  	s := js.srv
   573  	js.mu.RUnlock()
   574  
   575  	// Capture RAFT node from assignment.
   576  	node := ca.Group.node
   577  
   578  	// When we try to restart we nil out the node if applicable
   579  	// and reprocess the consumer assignment.
   580  	restartConsumer := func() {
   581  		mset.mu.RLock()
   582  		accName, streamName := mset.acc.GetName(), mset.cfg.Name
   583  		mset.mu.RUnlock()
   584  
   585  		js.mu.Lock()
   586  		deleted := ca.deleted
   587  		// Check that we have not just been created.
   588  		if !deleted && time.Since(ca.Created) < 10*time.Second {
   589  			s.Debugf("Not restarting missing consumer '%s > %s > %s', too soon since creation %v",
   590  				accName, streamName, consumer, time.Since(ca.Created))
   591  			js.mu.Unlock()
   592  			return
   593  		}
   594  		// Make sure the node is stopped if still running.
   595  		if node != nil && node.State() != Closed {
   596  			node.Stop()
   597  		}
   598  		ca.Group.node = nil
   599  		js.mu.Unlock()
   600  		if !deleted {
   601  			js.processConsumerAssignment(ca)
   602  		}
   603  	}
   604  
   605  	// Check if not running at all.
   606  	o := mset.lookupConsumer(consumer)
   607  	if o == nil {
   608  		restartConsumer()
   609  		return false
   610  	}
   611  
   612  	// Check RAFT node state.
   613  	if node == nil || node.Healthy() {
   614  		return true
   615  	} else if node != nil {
   616  		if node != o.raftNode() {
   617  			mset.mu.RLock()
   618  			accName, streamName := mset.acc.GetName(), mset.cfg.Name
   619  			mset.mu.RUnlock()
   620  			s.Warnf("Detected consumer cluster node skew '%s > %s > %s'", accName, streamName, consumer)
   621  			node.Delete()
   622  			o.deleteWithoutAdvisory()
   623  			restartConsumer()
   624  		} else if node.State() == Closed {
   625  			// We have a consumer, and it should have a running node but it is closed.
   626  			o.stop()
   627  			restartConsumer()
   628  		}
   629  	}
   630  	return false
   631  }
   632  
   633  // subjectsOverlap checks all existing stream assignments for the account cross-cluster for subject overlap
   634  // Use only for clustered JetStream
   635  // Read lock should be held.
   636  func (jsc *jetStreamCluster) subjectsOverlap(acc string, subjects []string, osa *streamAssignment) bool {
   637  	asa := jsc.streams[acc]
   638  	for _, sa := range asa {
   639  		// can't overlap yourself, assume osa pre-checked for deep equal if passed
   640  		if osa != nil && sa == osa {
   641  			continue
   642  		}
   643  		for _, subj := range sa.Config.Subjects {
   644  			for _, tsubj := range subjects {
   645  				if SubjectsCollide(tsubj, subj) {
   646  					return true
   647  				}
   648  			}
   649  		}
   650  	}
   651  	return false
   652  }
   653  
   654  func (a *Account) getJetStreamFromAccount() (*Server, *jetStream, *jsAccount) {
   655  	a.mu.RLock()
   656  	jsa := a.js
   657  	a.mu.RUnlock()
   658  	if jsa == nil {
   659  		return nil, nil, nil
   660  	}
   661  	jsa.mu.RLock()
   662  	js := jsa.js
   663  	jsa.mu.RUnlock()
   664  	if js == nil {
   665  		return nil, nil, nil
   666  	}
   667  	// Lock not needed, set on creation.
   668  	s := js.srv
   669  	return s, js, jsa
   670  }
   671  
   672  func (s *Server) JetStreamIsStreamLeader(account, stream string) bool {
   673  	js, cc := s.getJetStreamCluster()
   674  	if js == nil || cc == nil {
   675  		return false
   676  	}
   677  	js.mu.RLock()
   678  	defer js.mu.RUnlock()
   679  	return cc.isStreamLeader(account, stream)
   680  }
   681  
   682  func (a *Account) JetStreamIsStreamLeader(stream string) bool {
   683  	s, js, jsa := a.getJetStreamFromAccount()
   684  	if s == nil || js == nil || jsa == nil {
   685  		return false
   686  	}
   687  	js.mu.RLock()
   688  	defer js.mu.RUnlock()
   689  	return js.cluster.isStreamLeader(a.Name, stream)
   690  }
   691  
   692  func (s *Server) JetStreamIsStreamCurrent(account, stream string) bool {
   693  	js, cc := s.getJetStreamCluster()
   694  	if js == nil {
   695  		return false
   696  	}
   697  	js.mu.RLock()
   698  	defer js.mu.RUnlock()
   699  	return cc.isStreamCurrent(account, stream)
   700  }
   701  
   702  func (a *Account) JetStreamIsConsumerLeader(stream, consumer string) bool {
   703  	s, js, jsa := a.getJetStreamFromAccount()
   704  	if s == nil || js == nil || jsa == nil {
   705  		return false
   706  	}
   707  	js.mu.RLock()
   708  	defer js.mu.RUnlock()
   709  	return js.cluster.isConsumerLeader(a.Name, stream, consumer)
   710  }
   711  
   712  func (s *Server) JetStreamIsConsumerLeader(account, stream, consumer string) bool {
   713  	js, cc := s.getJetStreamCluster()
   714  	if js == nil || cc == nil {
   715  		return false
   716  	}
   717  	js.mu.RLock()
   718  	defer js.mu.RUnlock()
   719  	return cc.isConsumerLeader(account, stream, consumer)
   720  }
   721  
   722  func (s *Server) enableJetStreamClustering() error {
   723  	if !s.isRunning() {
   724  		return nil
   725  	}
   726  	js := s.getJetStream()
   727  	if js == nil {
   728  		return NewJSNotEnabledForAccountError()
   729  	}
   730  	// Already set.
   731  	if js.cluster != nil {
   732  		return nil
   733  	}
   734  
   735  	s.Noticef("Starting JetStream cluster")
   736  	// We need to determine if we have a stable cluster name and expected number of servers.
   737  	s.Debugf("JetStream cluster checking for stable cluster name and peers")
   738  
   739  	hasLeafNodeSystemShare := s.canExtendOtherDomain()
   740  	if s.isClusterNameDynamic() && !hasLeafNodeSystemShare {
   741  		return errors.New("JetStream cluster requires cluster name")
   742  	}
   743  	return js.setupMetaGroup()
   744  }
   745  
   746  // isClustered returns if we are clustered.
   747  // Lock should not be held.
   748  func (js *jetStream) isClustered() bool {
   749  	// This is only ever set, no need for lock here.
   750  	return js.cluster != nil
   751  }
   752  
   753  // isClusteredNoLock returns if we are clustered, but unlike isClustered() does
   754  // not use the jetstream's lock, instead, uses an atomic operation.
   755  // There are situations where some code wants to know if we are clustered but
   756  // can't use js.isClustered() without causing a lock inversion.
   757  func (js *jetStream) isClusteredNoLock() bool {
   758  	return atomic.LoadInt32(&js.clustered) == 1
   759  }
   760  
   761  func (js *jetStream) setupMetaGroup() error {
   762  	s := js.srv
   763  	s.Noticef("Creating JetStream metadata controller")
   764  
   765  	// Setup our WAL for the metagroup.
   766  	sysAcc := s.SystemAccount()
   767  	storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, defaultMetaGroupName)
   768  
   769  	fs, err := newFileStoreWithCreated(
   770  		FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMetaFSBlkSize, AsyncFlush: false, srv: s},
   771  		StreamConfig{Name: defaultMetaGroupName, Storage: FileStorage},
   772  		time.Now().UTC(),
   773  		s.jsKeyGen(s.getOpts().JetStreamKey, defaultMetaGroupName),
   774  		s.jsKeyGen(s.getOpts().JetStreamOldKey, defaultMetaGroupName),
   775  	)
   776  	if err != nil {
   777  		s.Errorf("Error creating filestore: %v", err)
   778  		return err
   779  	}
   780  
   781  	cfg := &RaftConfig{Name: defaultMetaGroupName, Store: storeDir, Log: fs}
   782  
   783  	// If we are soliciting leafnode connections and we are sharing a system account and do not disable it with a hint,
   784  	// we want to move to observer mode so that we extend the solicited cluster or supercluster but do not form our own.
   785  	cfg.Observer = s.canExtendOtherDomain() && s.getOpts().JetStreamExtHint != jsNoExtend
   786  
   787  	var bootstrap bool
   788  	if ps, err := readPeerState(storeDir); err != nil {
   789  		s.Noticef("JetStream cluster bootstrapping")
   790  		bootstrap = true
   791  		peers := s.ActivePeers()
   792  		s.Debugf("JetStream cluster initial peers: %+v", peers)
   793  		if err := s.bootstrapRaftNode(cfg, peers, false); err != nil {
   794  			return err
   795  		}
   796  		if cfg.Observer {
   797  			s.Noticef("Turning JetStream metadata controller Observer Mode on")
   798  		}
   799  	} else {
   800  		s.Noticef("JetStream cluster recovering state")
   801  		// correlate the value of observer with observations from a previous run.
   802  		if cfg.Observer {
   803  			switch ps.domainExt {
   804  			case extExtended:
   805  				s.Noticef("Keeping JetStream metadata controller Observer Mode on - due to previous contact")
   806  			case extNotExtended:
   807  				s.Noticef("Turning JetStream metadata controller Observer Mode off - due to previous contact")
   808  				cfg.Observer = false
   809  			case extUndetermined:
   810  				s.Noticef("Turning JetStream metadata controller Observer Mode on - no previous contact")
   811  				s.Noticef("In cases where JetStream will not be extended")
   812  				s.Noticef("and waiting for leader election until first contact is not acceptable,")
   813  				s.Noticef(`manually disable Observer Mode by setting the JetStream Option "extension_hint: %s"`, jsNoExtend)
   814  			}
   815  		} else {
   816  			// To track possible configuration changes, responsible for an altered value of cfg.Observer,
   817  			// set extension state to undetermined.
   818  			ps.domainExt = extUndetermined
   819  			if err := writePeerState(storeDir, ps); err != nil {
   820  				return err
   821  			}
   822  		}
   823  	}
   824  
   825  	// Start up our meta node.
   826  	n, err := s.startRaftNode(sysAcc.GetName(), cfg, pprofLabels{
   827  		"type":    "metaleader",
   828  		"account": sysAcc.Name,
   829  	})
   830  	if err != nil {
   831  		s.Warnf("Could not start metadata controller: %v", err)
   832  		return err
   833  	}
   834  
   835  	// If we are bootstrapped with no state, start campaign early.
   836  	if bootstrap {
   837  		n.Campaign()
   838  	}
   839  
   840  	c := s.createInternalJetStreamClient()
   841  	sacc := s.SystemAccount()
   842  
   843  	js.mu.Lock()
   844  	defer js.mu.Unlock()
   845  	js.cluster = &jetStreamCluster{
   846  		meta:    n,
   847  		streams: make(map[string]map[string]*streamAssignment),
   848  		s:       s,
   849  		c:       c,
   850  		qch:     make(chan struct{}),
   851  	}
   852  	atomic.StoreInt32(&js.clustered, 1)
   853  	c.registerWithAccount(sacc)
   854  
   855  	js.srv.startGoRoutine(
   856  		js.monitorCluster,
   857  		pprofLabels{
   858  			"type":    "metaleader",
   859  			"account": sacc.Name,
   860  		},
   861  	)
   862  	return nil
   863  }
   864  
   865  func (js *jetStream) getMetaGroup() RaftNode {
   866  	js.mu.RLock()
   867  	defer js.mu.RUnlock()
   868  	if js.cluster == nil {
   869  		return nil
   870  	}
   871  	return js.cluster.meta
   872  }
   873  
   874  func (js *jetStream) server() *Server {
   875  	// Lock not needed, only set once on creation.
   876  	return js.srv
   877  }
   878  
   879  // Will respond if we do not think we have a metacontroller leader.
   880  func (js *jetStream) isLeaderless() bool {
   881  	js.mu.RLock()
   882  	defer js.mu.RUnlock()
   883  
   884  	cc := js.cluster
   885  	if cc == nil || cc.meta == nil {
   886  		return false
   887  	}
   888  	// If we don't have a leader.
   889  	// Make sure we have been running for enough time.
   890  	if cc.meta.GroupLeader() == _EMPTY_ && time.Since(cc.meta.Created()) > lostQuorumIntervalDefault {
   891  		return true
   892  	}
   893  	return false
   894  }
   895  
   896  // Will respond iff we are a member and we know we have no leader.
   897  func (js *jetStream) isGroupLeaderless(rg *raftGroup) bool {
   898  	if rg == nil || js == nil {
   899  		return false
   900  	}
   901  	js.mu.RLock()
   902  	defer js.mu.RUnlock()
   903  
   904  	cc := js.cluster
   905  
   906  	// If we are not a member we can not say..
   907  	if cc.meta == nil {
   908  		return false
   909  	}
   910  	if !rg.isMember(cc.meta.ID()) {
   911  		return false
   912  	}
   913  	// Single peer groups always have a leader if we are here.
   914  	if rg.node == nil {
   915  		return false
   916  	}
   917  	// If we don't have a leader.
   918  	if rg.node.GroupLeader() == _EMPTY_ {
   919  		// Threshold for jetstream startup.
   920  		const startupThreshold = 10 * time.Second
   921  
   922  		if rg.node.HadPreviousLeader() {
   923  			// Make sure we have been running long enough to intelligently determine this.
   924  			if time.Since(js.started) > startupThreshold {
   925  				return true
   926  			}
   927  		}
   928  		// Make sure we have been running for enough time.
   929  		if time.Since(rg.node.Created()) > lostQuorumIntervalDefault {
   930  			return true
   931  		}
   932  	}
   933  
   934  	return false
   935  }
   936  
   937  func (s *Server) JetStreamIsStreamAssigned(account, stream string) bool {
   938  	js, cc := s.getJetStreamCluster()
   939  	if js == nil || cc == nil {
   940  		return false
   941  	}
   942  	acc, _ := s.LookupAccount(account)
   943  	if acc == nil {
   944  		return false
   945  	}
   946  	js.mu.RLock()
   947  	assigned := cc.isStreamAssigned(acc, stream)
   948  	js.mu.RUnlock()
   949  	return assigned
   950  }
   951  
   952  // streamAssigned informs us if this server has this stream assigned.
   953  func (jsa *jsAccount) streamAssigned(stream string) bool {
   954  	jsa.mu.RLock()
   955  	js, acc := jsa.js, jsa.account
   956  	jsa.mu.RUnlock()
   957  
   958  	if js == nil {
   959  		return false
   960  	}
   961  	js.mu.RLock()
   962  	assigned := js.cluster.isStreamAssigned(acc, stream)
   963  	js.mu.RUnlock()
   964  	return assigned
   965  }
   966  
   967  // Read lock should be held.
   968  func (cc *jetStreamCluster) isStreamAssigned(a *Account, stream string) bool {
   969  	// Non-clustered mode always return true.
   970  	if cc == nil {
   971  		return true
   972  	}
   973  	if cc.meta == nil {
   974  		return false
   975  	}
   976  	as := cc.streams[a.Name]
   977  	if as == nil {
   978  		return false
   979  	}
   980  	sa := as[stream]
   981  	if sa == nil {
   982  		return false
   983  	}
   984  	rg := sa.Group
   985  	if rg == nil {
   986  		return false
   987  	}
   988  	// Check if we are the leader of this raftGroup assigned to the stream.
   989  	ourID := cc.meta.ID()
   990  	for _, peer := range rg.Peers {
   991  		if peer == ourID {
   992  			return true
   993  		}
   994  	}
   995  	return false
   996  }
   997  
   998  // Read lock should be held.
   999  func (cc *jetStreamCluster) isStreamLeader(account, stream string) bool {
  1000  	// Non-clustered mode always return true.
  1001  	if cc == nil {
  1002  		return true
  1003  	}
  1004  	if cc.meta == nil {
  1005  		return false
  1006  	}
  1007  
  1008  	var sa *streamAssignment
  1009  	if as := cc.streams[account]; as != nil {
  1010  		sa = as[stream]
  1011  	}
  1012  	if sa == nil {
  1013  		return false
  1014  	}
  1015  	rg := sa.Group
  1016  	if rg == nil {
  1017  		return false
  1018  	}
  1019  	// Check if we are the leader of this raftGroup assigned to the stream.
  1020  	ourID := cc.meta.ID()
  1021  	for _, peer := range rg.Peers {
  1022  		if peer == ourID {
  1023  			if len(rg.Peers) == 1 || rg.node != nil && rg.node.Leader() {
  1024  				return true
  1025  			}
  1026  		}
  1027  	}
  1028  	return false
  1029  }
  1030  
  1031  // Read lock should be held.
  1032  func (cc *jetStreamCluster) isConsumerLeader(account, stream, consumer string) bool {
  1033  	// Non-clustered mode always return true.
  1034  	if cc == nil {
  1035  		return true
  1036  	}
  1037  	if cc.meta == nil {
  1038  		return false
  1039  	}
  1040  
  1041  	var sa *streamAssignment
  1042  	if as := cc.streams[account]; as != nil {
  1043  		sa = as[stream]
  1044  	}
  1045  	if sa == nil {
  1046  		return false
  1047  	}
  1048  	// Check if we are the leader of this raftGroup assigned to this consumer.
  1049  	ca := sa.consumers[consumer]
  1050  	if ca == nil {
  1051  		return false
  1052  	}
  1053  	rg := ca.Group
  1054  	ourID := cc.meta.ID()
  1055  	for _, peer := range rg.Peers {
  1056  		if peer == ourID {
  1057  			if len(rg.Peers) == 1 || (rg.node != nil && rg.node.Leader()) {
  1058  				return true
  1059  			}
  1060  		}
  1061  	}
  1062  	return false
  1063  }
  1064  
  1065  // Remove the stream `streamName` for the account `accName` from the inflight
  1066  // proposals map. This is done on success (processStreamAssignment) or on
  1067  // failure (processStreamAssignmentResults).
  1068  // (Write) Lock held on entry.
  1069  func (cc *jetStreamCluster) removeInflightProposal(accName, streamName string) {
  1070  	streams, ok := cc.inflight[accName]
  1071  	if !ok {
  1072  		return
  1073  	}
  1074  	delete(streams, streamName)
  1075  	if len(streams) == 0 {
  1076  		delete(cc.inflight, accName)
  1077  	}
  1078  }
  1079  
  1080  // Return the cluster quit chan.
  1081  func (js *jetStream) clusterQuitC() chan struct{} {
  1082  	js.mu.RLock()
  1083  	defer js.mu.RUnlock()
  1084  	if js.cluster != nil {
  1085  		return js.cluster.qch
  1086  	}
  1087  	return nil
  1088  }
  1089  
  1090  // Mark that the meta layer is recovering.
  1091  func (js *jetStream) setMetaRecovering() {
  1092  	js.mu.Lock()
  1093  	defer js.mu.Unlock()
  1094  	if js.cluster != nil {
  1095  		// metaRecovering
  1096  		js.metaRecovering = true
  1097  	}
  1098  }
  1099  
  1100  // Mark that the meta layer is no longer recovering.
  1101  func (js *jetStream) clearMetaRecovering() {
  1102  	js.mu.Lock()
  1103  	defer js.mu.Unlock()
  1104  	js.metaRecovering = false
  1105  }
  1106  
  1107  // Return whether the meta layer is recovering.
  1108  func (js *jetStream) isMetaRecovering() bool {
  1109  	js.mu.RLock()
  1110  	defer js.mu.RUnlock()
  1111  	return js.metaRecovering
  1112  }
  1113  
  1114  // During recovery track any stream and consumer delete and update operations.
  1115  type recoveryUpdates struct {
  1116  	removeStreams   map[string]*streamAssignment
  1117  	removeConsumers map[string]*consumerAssignment
  1118  	updateStreams   map[string]*streamAssignment
  1119  	updateConsumers map[string]*consumerAssignment
  1120  }
  1121  
  1122  // Called after recovery of the cluster on startup to check for any orphans.
  1123  // Streams and consumers are recovered from disk, and the meta layer's mappings
  1124  // should clean them up, but under crash scenarios there could be orphans.
  1125  func (js *jetStream) checkForOrphans() {
  1126  	consumerName := func(o *consumer) string {
  1127  		o.mu.RLock()
  1128  		defer o.mu.RUnlock()
  1129  		return o.name
  1130  	}
  1131  
  1132  	// Can not hold jetstream lock while trying to delete streams or consumers.
  1133  	js.mu.Lock()
  1134  	s, cc := js.srv, js.cluster
  1135  	s.Debugf("JetStream cluster checking for orphans")
  1136  
  1137  	var streams []*stream
  1138  	var consumers []*consumer
  1139  
  1140  	for accName, jsa := range js.accounts {
  1141  		asa := cc.streams[accName]
  1142  		jsa.mu.RLock()
  1143  		for stream, mset := range jsa.streams {
  1144  			if sa := asa[stream]; sa == nil {
  1145  				streams = append(streams, mset)
  1146  			} else {
  1147  				// This one is good, check consumers now.
  1148  				for _, o := range mset.getConsumers() {
  1149  					consumer := consumerName(o)
  1150  					if sa.consumers[consumer] == nil {
  1151  						consumers = append(consumers, o)
  1152  					}
  1153  				}
  1154  			}
  1155  		}
  1156  		jsa.mu.RUnlock()
  1157  	}
  1158  	js.mu.Unlock()
  1159  
  1160  	for _, mset := range streams {
  1161  		mset.mu.RLock()
  1162  		accName, stream := mset.acc.Name, mset.cfg.Name
  1163  		mset.mu.RUnlock()
  1164  		s.Warnf("Detected orphaned stream '%s > %s', will cleanup", accName, stream)
  1165  		if err := mset.delete(); err != nil {
  1166  			s.Warnf("Deleting stream encountered an error: %v", err)
  1167  		}
  1168  	}
  1169  	for _, o := range consumers {
  1170  		o.mu.RLock()
  1171  		accName, mset, consumer := o.acc.Name, o.mset, o.name
  1172  		o.mu.RUnlock()
  1173  		stream := "N/A"
  1174  		if mset != nil {
  1175  			mset.mu.RLock()
  1176  			stream = mset.cfg.Name
  1177  			mset.mu.RUnlock()
  1178  		}
  1179  		s.Warnf("Detected orphaned consumer '%s > %s > %s', will cleanup", accName, stream, consumer)
  1180  		if err := o.delete(); err != nil {
  1181  			s.Warnf("Deleting consumer encountered an error: %v", err)
  1182  		}
  1183  	}
  1184  }
  1185  
  1186  // Check and delete any orphans we may come across.
  1187  func (s *Server) checkForNRGOrphans() {
  1188  	js, cc := s.getJetStreamCluster()
  1189  	if js == nil || cc == nil || js.isMetaRecovering() {
  1190  		// No cluster means no NRGs. Also return if still recovering.
  1191  		return
  1192  	}
  1193  
  1194  	// Track which assets R>1 should be on this server.
  1195  	nrgMap := make(map[string]struct{})
  1196  	trackGroup := func(rg *raftGroup) {
  1197  		// If R>1 track this as a legit NRG.
  1198  		if rg.node != nil {
  1199  			nrgMap[rg.Name] = struct{}{}
  1200  		}
  1201  	}
  1202  	// Register our meta.
  1203  	js.mu.RLock()
  1204  	meta := cc.meta
  1205  	if meta == nil {
  1206  		js.mu.RUnlock()
  1207  		// Bail with no meta node.
  1208  		return
  1209  	}
  1210  
  1211  	ourID := meta.ID()
  1212  	nrgMap[meta.Group()] = struct{}{}
  1213  
  1214  	// Collect all valid groups from our assignments.
  1215  	for _, asa := range cc.streams {
  1216  		for _, sa := range asa {
  1217  			if sa.Group.isMember(ourID) && sa.Restore == nil {
  1218  				trackGroup(sa.Group)
  1219  				for _, ca := range sa.consumers {
  1220  					if ca.Group.isMember(ourID) {
  1221  						trackGroup(ca.Group)
  1222  					}
  1223  				}
  1224  			}
  1225  		}
  1226  	}
  1227  	js.mu.RUnlock()
  1228  
  1229  	// Check NRGs that are running.
  1230  	var needDelete []RaftNode
  1231  	s.rnMu.RLock()
  1232  	for name, n := range s.raftNodes {
  1233  		if _, ok := nrgMap[name]; !ok {
  1234  			needDelete = append(needDelete, n)
  1235  		}
  1236  	}
  1237  	s.rnMu.RUnlock()
  1238  
  1239  	for _, n := range needDelete {
  1240  		s.Warnf("Detected orphaned NRG %q, will cleanup", n.Group())
  1241  		n.Delete()
  1242  	}
  1243  }
  1244  
  1245  func (js *jetStream) monitorCluster() {
  1246  	s, n := js.server(), js.getMetaGroup()
  1247  	qch, rqch, lch, aq := js.clusterQuitC(), n.QuitC(), n.LeadChangeC(), n.ApplyQ()
  1248  
  1249  	defer s.grWG.Done()
  1250  
  1251  	s.Debugf("Starting metadata monitor")
  1252  	defer s.Debugf("Exiting metadata monitor")
  1253  
  1254  	// Make sure to stop the raft group on exit to prevent accidental memory bloat.
  1255  	defer n.Stop()
  1256  	defer s.isMetaLeader.Store(false)
  1257  
  1258  	const compactInterval = time.Minute
  1259  	t := time.NewTicker(compactInterval)
  1260  	defer t.Stop()
  1261  
  1262  	// Used to check cold boot cluster when possibly in mixed mode.
  1263  	const leaderCheckInterval = time.Second
  1264  	lt := time.NewTicker(leaderCheckInterval)
  1265  	defer lt.Stop()
  1266  
  1267  	// Check the general health once an hour.
  1268  	const healthCheckInterval = 1 * time.Hour
  1269  	ht := time.NewTicker(healthCheckInterval)
  1270  	defer ht.Stop()
  1271  
  1272  	// Utility to check health.
  1273  	checkHealth := func() {
  1274  		if hs := s.healthz(nil); hs.Error != _EMPTY_ {
  1275  			s.Warnf("%v", hs.Error)
  1276  		}
  1277  		// Also check for orphaned NRGs.
  1278  		s.checkForNRGOrphans()
  1279  	}
  1280  
  1281  	var (
  1282  		isLeader       bool
  1283  		lastSnapTime   time.Time
  1284  		compactSizeMin = uint64(8 * 1024 * 1024) // 8MB
  1285  		minSnapDelta   = 10 * time.Second
  1286  	)
  1287  
  1288  	// Highwayhash key for generating hashes.
  1289  	key := make([]byte, 32)
  1290  	crand.Read(key)
  1291  
  1292  	// Set to true to start.
  1293  	js.setMetaRecovering()
  1294  
  1295  	// Snapshotting function.
  1296  	doSnapshot := func() {
  1297  		// Suppress during recovery.
  1298  		if js.isMetaRecovering() {
  1299  			return
  1300  		}
  1301  		// For the meta layer we want to snapshot when asked if we need one or have any entries that we can compact.
  1302  		if ne, _ := n.Size(); ne > 0 || n.NeedSnapshot() {
  1303  			if err := n.InstallSnapshot(js.metaSnapshot()); err == nil {
  1304  				lastSnapTime = time.Now()
  1305  			} else if err != errNoSnapAvailable && err != errNodeClosed {
  1306  				s.Warnf("Error snapshotting JetStream cluster state: %v", err)
  1307  			}
  1308  		}
  1309  	}
  1310  
  1311  	ru := &recoveryUpdates{
  1312  		removeStreams:   make(map[string]*streamAssignment),
  1313  		removeConsumers: make(map[string]*consumerAssignment),
  1314  		updateStreams:   make(map[string]*streamAssignment),
  1315  		updateConsumers: make(map[string]*consumerAssignment),
  1316  	}
  1317  
  1318  	for {
  1319  		select {
  1320  		case <-s.quitCh:
  1321  			return
  1322  		case <-rqch:
  1323  			return
  1324  		case <-qch:
  1325  			// Clean signal from shutdown routine so do best effort attempt to snapshot meta layer.
  1326  			doSnapshot()
  1327  			// Return the signal back since shutdown will be waiting.
  1328  			close(qch)
  1329  			return
  1330  		case <-aq.ch:
  1331  			ces := aq.pop()
  1332  			for _, ce := range ces {
  1333  				if ce == nil {
  1334  					// Signals we have replayed all of our metadata.
  1335  					js.clearMetaRecovering()
  1336  					// Process any removes that are still valid after recovery.
  1337  					for _, ca := range ru.removeConsumers {
  1338  						js.processConsumerRemoval(ca)
  1339  					}
  1340  					for _, sa := range ru.removeStreams {
  1341  						js.processStreamRemoval(sa)
  1342  					}
  1343  					// Process pending updates.
  1344  					for _, sa := range ru.updateStreams {
  1345  						js.processUpdateStreamAssignment(sa)
  1346  					}
  1347  					// Now consumers.
  1348  					for _, ca := range ru.updateConsumers {
  1349  						js.processConsumerAssignment(ca)
  1350  					}
  1351  					// Clear.
  1352  					ru = nil
  1353  					s.Debugf("Recovered JetStream cluster metadata")
  1354  					js.checkForOrphans()
  1355  					// Do a health check here as well.
  1356  					go checkHealth()
  1357  					continue
  1358  				}
  1359  				if didSnap, didStreamRemoval, didConsumerRemoval, err := js.applyMetaEntries(ce.Entries, ru); err == nil {
  1360  					_, nb := n.Applied(ce.Index)
  1361  					if js.hasPeerEntries(ce.Entries) || didStreamRemoval || (didSnap && !isLeader) {
  1362  						doSnapshot()
  1363  					} else if didConsumerRemoval && time.Since(lastSnapTime) > minSnapDelta/2 {
  1364  						doSnapshot()
  1365  					} else if nb > compactSizeMin && time.Since(lastSnapTime) > minSnapDelta {
  1366  						doSnapshot()
  1367  					}
  1368  					ce.ReturnToPool()
  1369  				} else {
  1370  					s.Warnf("Error applying JetStream cluster entries: %v", err)
  1371  				}
  1372  			}
  1373  			aq.recycle(&ces)
  1374  
  1375  		case isLeader = <-lch:
  1376  			// For meta layer synchronize everyone to our state on becoming leader.
  1377  			if isLeader && n.ApplyQ().len() == 0 {
  1378  				n.SendSnapshot(js.metaSnapshot())
  1379  			}
  1380  			// Process the change.
  1381  			js.processLeaderChange(isLeader)
  1382  			if isLeader {
  1383  				s.sendInternalMsgLocked(serverStatsPingReqSubj, _EMPTY_, nil, nil)
  1384  				// Install a snapshot as we become leader.
  1385  				js.checkClusterSize()
  1386  				doSnapshot()
  1387  			}
  1388  
  1389  		case <-t.C:
  1390  			doSnapshot()
  1391  			// Periodically check the cluster size.
  1392  			if n.Leader() {
  1393  				js.checkClusterSize()
  1394  			}
  1395  		case <-ht.C:
  1396  			// Do this in a separate go routine.
  1397  			go checkHealth()
  1398  
  1399  		case <-lt.C:
  1400  			s.Debugf("Checking JetStream cluster state")
  1401  			// If we have a current leader or had one in the past we can cancel this here since the metaleader
  1402  			// will be in charge of all peer state changes.
  1403  			// For cold boot only.
  1404  			if n.GroupLeader() != _EMPTY_ || n.HadPreviousLeader() {
  1405  				lt.Stop()
  1406  				continue
  1407  			}
  1408  			// If we are here we do not have a leader and we did not have a previous one, so cold start.
  1409  			// Check to see if we can adjust our cluster size down iff we are in mixed mode and we have
  1410  			// seen a total that is what our original estimate was.
  1411  			cs := n.ClusterSize()
  1412  			if js, total := s.trackedJetStreamServers(); js < total && total >= cs && js != cs {
  1413  				s.Noticef("Adjusting JetStream expected peer set size to %d from original %d", js, cs)
  1414  				n.AdjustBootClusterSize(js)
  1415  			}
  1416  		}
  1417  	}
  1418  }
  1419  
  1420  // This is called on first leader transition to double check the peers and cluster set size.
  1421  func (js *jetStream) checkClusterSize() {
  1422  	s, n := js.server(), js.getMetaGroup()
  1423  	if n == nil {
  1424  		return
  1425  	}
  1426  	// We will check that we have a correct cluster set size by checking for any non-js servers
  1427  	// which can happen in mixed mode.
  1428  	ps := n.(*raft).currentPeerState()
  1429  	if len(ps.knownPeers) >= ps.clusterSize {
  1430  		return
  1431  	}
  1432  
  1433  	// Grab our active peers.
  1434  	peers := s.ActivePeers()
  1435  
  1436  	// If we have not registered all of our peers yet we can't do
  1437  	// any adjustments based on a mixed mode. We will periodically check back.
  1438  	if len(peers) < ps.clusterSize {
  1439  		return
  1440  	}
  1441  
  1442  	s.Debugf("Checking JetStream cluster size")
  1443  
  1444  	// If we are here our known set as the leader is not the same as the cluster size.
  1445  	// Check to see if we have a mixed mode setup.
  1446  	var totalJS int
  1447  	for _, p := range peers {
  1448  		if si, ok := s.nodeToInfo.Load(p); ok && si != nil {
  1449  			if si.(nodeInfo).js {
  1450  				totalJS++
  1451  			}
  1452  		}
  1453  	}
  1454  	// If we have less then our cluster size adjust that here. Can not do individual peer removals since
  1455  	// they will not be in the tracked peers.
  1456  	if totalJS < ps.clusterSize {
  1457  		s.Debugf("Adjusting JetStream cluster size from %d to %d", ps.clusterSize, totalJS)
  1458  		if err := n.AdjustClusterSize(totalJS); err != nil {
  1459  			s.Warnf("Error adjusting JetStream cluster size: %v", err)
  1460  		}
  1461  	}
  1462  }
  1463  
  1464  // Represents our stable meta state that we can write out.
  1465  type writeableStreamAssignment struct {
  1466  	Client    *ClientInfo   `json:"client,omitempty"`
  1467  	Created   time.Time     `json:"created"`
  1468  	Config    *StreamConfig `json:"stream"`
  1469  	Group     *raftGroup    `json:"group"`
  1470  	Sync      string        `json:"sync"`
  1471  	Consumers []*consumerAssignment
  1472  }
  1473  
  1474  func (js *jetStream) clusterStreamConfig(accName, streamName string) (StreamConfig, bool) {
  1475  	js.mu.RLock()
  1476  	defer js.mu.RUnlock()
  1477  	if sa, ok := js.cluster.streams[accName][streamName]; ok {
  1478  		return *sa.Config, true
  1479  	}
  1480  	return StreamConfig{}, false
  1481  }
  1482  
  1483  func (js *jetStream) metaSnapshot() []byte {
  1484  	js.mu.RLock()
  1485  	cc := js.cluster
  1486  	nsa := 0
  1487  	for _, asa := range cc.streams {
  1488  		nsa += len(asa)
  1489  	}
  1490  	streams := make([]writeableStreamAssignment, 0, nsa)
  1491  	for _, asa := range cc.streams {
  1492  		for _, sa := range asa {
  1493  			wsa := writeableStreamAssignment{
  1494  				Client:    sa.Client,
  1495  				Created:   sa.Created,
  1496  				Config:    sa.Config,
  1497  				Group:     sa.Group,
  1498  				Sync:      sa.Sync,
  1499  				Consumers: make([]*consumerAssignment, 0, len(sa.consumers)),
  1500  			}
  1501  			for _, ca := range sa.consumers {
  1502  				wsa.Consumers = append(wsa.Consumers, ca)
  1503  			}
  1504  			streams = append(streams, wsa)
  1505  		}
  1506  	}
  1507  
  1508  	if len(streams) == 0 {
  1509  		js.mu.RUnlock()
  1510  		return nil
  1511  	}
  1512  
  1513  	b, _ := json.Marshal(streams)
  1514  	js.mu.RUnlock()
  1515  
  1516  	return s2.EncodeBetter(nil, b)
  1517  }
  1518  
  1519  func (js *jetStream) applyMetaSnapshot(buf []byte, ru *recoveryUpdates, isRecovering bool) error {
  1520  	var wsas []writeableStreamAssignment
  1521  	if len(buf) > 0 {
  1522  		jse, err := s2.Decode(nil, buf)
  1523  		if err != nil {
  1524  			return err
  1525  		}
  1526  		if err = json.Unmarshal(jse, &wsas); err != nil {
  1527  			return err
  1528  		}
  1529  	}
  1530  
  1531  	// Build our new version here outside of js.
  1532  	streams := make(map[string]map[string]*streamAssignment)
  1533  	for _, wsa := range wsas {
  1534  		fixCfgMirrorWithDedupWindow(wsa.Config)
  1535  		as := streams[wsa.Client.serviceAccount()]
  1536  		if as == nil {
  1537  			as = make(map[string]*streamAssignment)
  1538  			streams[wsa.Client.serviceAccount()] = as
  1539  		}
  1540  		sa := &streamAssignment{Client: wsa.Client, Created: wsa.Created, Config: wsa.Config, Group: wsa.Group, Sync: wsa.Sync}
  1541  		if len(wsa.Consumers) > 0 {
  1542  			sa.consumers = make(map[string]*consumerAssignment)
  1543  			for _, ca := range wsa.Consumers {
  1544  				sa.consumers[ca.Name] = ca
  1545  			}
  1546  		}
  1547  		as[wsa.Config.Name] = sa
  1548  	}
  1549  
  1550  	js.mu.Lock()
  1551  	cc := js.cluster
  1552  
  1553  	var saAdd, saDel, saChk []*streamAssignment
  1554  	// Walk through the old list to generate the delete list.
  1555  	for account, asa := range cc.streams {
  1556  		nasa := streams[account]
  1557  		for sn, sa := range asa {
  1558  			if nsa := nasa[sn]; nsa == nil {
  1559  				saDel = append(saDel, sa)
  1560  			} else {
  1561  				saChk = append(saChk, nsa)
  1562  			}
  1563  		}
  1564  	}
  1565  	// Walk through the new list to generate the add list.
  1566  	for account, nasa := range streams {
  1567  		asa := cc.streams[account]
  1568  		for sn, sa := range nasa {
  1569  			if asa[sn] == nil {
  1570  				saAdd = append(saAdd, sa)
  1571  			}
  1572  		}
  1573  	}
  1574  	// Now walk the ones to check and process consumers.
  1575  	var caAdd, caDel []*consumerAssignment
  1576  	for _, sa := range saChk {
  1577  		// Make sure to add in all the new ones from sa.
  1578  		for _, ca := range sa.consumers {
  1579  			caAdd = append(caAdd, ca)
  1580  		}
  1581  		if osa := js.streamAssignment(sa.Client.serviceAccount(), sa.Config.Name); osa != nil {
  1582  			for _, ca := range osa.consumers {
  1583  				if sa.consumers[ca.Name] == nil {
  1584  					caDel = append(caDel, ca)
  1585  				} else {
  1586  					caAdd = append(caAdd, ca)
  1587  				}
  1588  			}
  1589  		}
  1590  	}
  1591  	js.mu.Unlock()
  1592  
  1593  	// Do removals first.
  1594  	for _, sa := range saDel {
  1595  		js.setStreamAssignmentRecovering(sa)
  1596  		if isRecovering {
  1597  			key := sa.recoveryKey()
  1598  			ru.removeStreams[key] = sa
  1599  			delete(ru.updateStreams, key)
  1600  		} else {
  1601  			js.processStreamRemoval(sa)
  1602  		}
  1603  	}
  1604  	// Now do add for the streams. Also add in all consumers.
  1605  	for _, sa := range saAdd {
  1606  		js.setStreamAssignmentRecovering(sa)
  1607  		js.processStreamAssignment(sa)
  1608  
  1609  		// We can simply process the consumers.
  1610  		for _, ca := range sa.consumers {
  1611  			js.setConsumerAssignmentRecovering(ca)
  1612  			js.processConsumerAssignment(ca)
  1613  		}
  1614  	}
  1615  
  1616  	// Perform updates on those in saChk. These were existing so make
  1617  	// sure to process any changes.
  1618  	for _, sa := range saChk {
  1619  		js.setStreamAssignmentRecovering(sa)
  1620  		if isRecovering {
  1621  			key := sa.recoveryKey()
  1622  			ru.updateStreams[key] = sa
  1623  			delete(ru.removeStreams, key)
  1624  		} else {
  1625  			js.processUpdateStreamAssignment(sa)
  1626  		}
  1627  	}
  1628  
  1629  	// Now do the deltas for existing stream's consumers.
  1630  	for _, ca := range caDel {
  1631  		js.setConsumerAssignmentRecovering(ca)
  1632  		if isRecovering {
  1633  			key := ca.recoveryKey()
  1634  			ru.removeConsumers[key] = ca
  1635  			delete(ru.updateConsumers, key)
  1636  		} else {
  1637  			js.processConsumerRemoval(ca)
  1638  		}
  1639  	}
  1640  	for _, ca := range caAdd {
  1641  		js.setConsumerAssignmentRecovering(ca)
  1642  		if isRecovering {
  1643  			key := ca.recoveryKey()
  1644  			delete(ru.removeConsumers, key)
  1645  			ru.updateConsumers[key] = ca
  1646  		} else {
  1647  			js.processConsumerAssignment(ca)
  1648  		}
  1649  	}
  1650  
  1651  	return nil
  1652  }
  1653  
  1654  // Called on recovery to make sure we do not process like original.
  1655  func (js *jetStream) setStreamAssignmentRecovering(sa *streamAssignment) {
  1656  	js.mu.Lock()
  1657  	defer js.mu.Unlock()
  1658  	sa.responded = true
  1659  	sa.recovering = true
  1660  	sa.Restore = nil
  1661  	if sa.Group != nil {
  1662  		sa.Group.Preferred = _EMPTY_
  1663  	}
  1664  }
  1665  
  1666  // Called on recovery to make sure we do not process like original.
  1667  func (js *jetStream) setConsumerAssignmentRecovering(ca *consumerAssignment) {
  1668  	js.mu.Lock()
  1669  	defer js.mu.Unlock()
  1670  	ca.responded = true
  1671  	ca.recovering = true
  1672  	if ca.Group != nil {
  1673  		ca.Group.Preferred = _EMPTY_
  1674  	}
  1675  }
  1676  
  1677  // Just copies over and changes out the group so it can be encoded.
  1678  // Lock should be held.
  1679  func (sa *streamAssignment) copyGroup() *streamAssignment {
  1680  	csa, cg := *sa, *sa.Group
  1681  	csa.Group = &cg
  1682  	csa.Group.Peers = copyStrings(sa.Group.Peers)
  1683  	return &csa
  1684  }
  1685  
  1686  // Just copies over and changes out the group so it can be encoded.
  1687  // Lock should be held.
  1688  func (ca *consumerAssignment) copyGroup() *consumerAssignment {
  1689  	cca, cg := *ca, *ca.Group
  1690  	cca.Group = &cg
  1691  	cca.Group.Peers = copyStrings(ca.Group.Peers)
  1692  	return &cca
  1693  }
  1694  
  1695  // Lock should be held.
  1696  func (sa *streamAssignment) missingPeers() bool {
  1697  	return len(sa.Group.Peers) < sa.Config.Replicas
  1698  }
  1699  
  1700  // Called when we detect a new peer. Only the leader will process checking
  1701  // for any streams, and consequently any consumers.
  1702  func (js *jetStream) processAddPeer(peer string) {
  1703  	js.mu.Lock()
  1704  	defer js.mu.Unlock()
  1705  
  1706  	s, cc := js.srv, js.cluster
  1707  	if cc == nil || cc.meta == nil {
  1708  		return
  1709  	}
  1710  	isLeader := cc.isLeader()
  1711  
  1712  	// Now check if we are meta-leader. We will check for any re-assignments.
  1713  	if !isLeader {
  1714  		return
  1715  	}
  1716  
  1717  	sir, ok := s.nodeToInfo.Load(peer)
  1718  	if !ok || sir == nil {
  1719  		return
  1720  	}
  1721  	si := sir.(nodeInfo)
  1722  
  1723  	for _, asa := range cc.streams {
  1724  		for _, sa := range asa {
  1725  			if sa.missingPeers() {
  1726  				// Make sure the right cluster etc.
  1727  				if si.cluster != sa.Client.Cluster {
  1728  					continue
  1729  				}
  1730  				// If we are here we can add in this peer.
  1731  				csa := sa.copyGroup()
  1732  				csa.Group.Peers = append(csa.Group.Peers, peer)
  1733  				// Send our proposal for this csa. Also use same group definition for all the consumers as well.
  1734  				cc.meta.Propose(encodeAddStreamAssignment(csa))
  1735  				for _, ca := range sa.consumers {
  1736  					// Ephemerals are R=1, so only auto-remap durables, or R>1.
  1737  					if ca.Config.Durable != _EMPTY_ || len(ca.Group.Peers) > 1 {
  1738  						cca := ca.copyGroup()
  1739  						cca.Group.Peers = csa.Group.Peers
  1740  						cc.meta.Propose(encodeAddConsumerAssignment(cca))
  1741  					}
  1742  				}
  1743  			}
  1744  		}
  1745  	}
  1746  }
  1747  
  1748  func (js *jetStream) processRemovePeer(peer string) {
  1749  	// We may be already disabled.
  1750  	if js == nil || js.disabled.Load() {
  1751  		return
  1752  	}
  1753  
  1754  	js.mu.Lock()
  1755  	s, cc := js.srv, js.cluster
  1756  	if cc == nil || cc.meta == nil {
  1757  		js.mu.Unlock()
  1758  		return
  1759  	}
  1760  	isLeader := cc.isLeader()
  1761  	// All nodes will check if this is them.
  1762  	isUs := cc.meta.ID() == peer
  1763  	js.mu.Unlock()
  1764  
  1765  	if isUs {
  1766  		s.Errorf("JetStream being DISABLED, our server was removed from the cluster")
  1767  		adv := &JSServerRemovedAdvisory{
  1768  			TypedEvent: TypedEvent{
  1769  				Type: JSServerRemovedAdvisoryType,
  1770  				ID:   nuid.Next(),
  1771  				Time: time.Now().UTC(),
  1772  			},
  1773  			Server:   s.Name(),
  1774  			ServerID: s.ID(),
  1775  			Cluster:  s.cachedClusterName(),
  1776  			Domain:   s.getOpts().JetStreamDomain,
  1777  		}
  1778  		s.publishAdvisory(nil, JSAdvisoryServerRemoved, adv)
  1779  
  1780  		go s.DisableJetStream()
  1781  	}
  1782  
  1783  	// Now check if we are meta-leader. We will attempt re-assignment.
  1784  	if !isLeader {
  1785  		return
  1786  	}
  1787  
  1788  	js.mu.Lock()
  1789  	defer js.mu.Unlock()
  1790  
  1791  	for _, asa := range cc.streams {
  1792  		for _, sa := range asa {
  1793  			if rg := sa.Group; rg.isMember(peer) {
  1794  				js.removePeerFromStreamLocked(sa, peer)
  1795  			}
  1796  		}
  1797  	}
  1798  }
  1799  
  1800  // Assumes all checks have already been done.
  1801  func (js *jetStream) removePeerFromStream(sa *streamAssignment, peer string) bool {
  1802  	js.mu.Lock()
  1803  	defer js.mu.Unlock()
  1804  	return js.removePeerFromStreamLocked(sa, peer)
  1805  }
  1806  
  1807  // Lock should be held.
  1808  func (js *jetStream) removePeerFromStreamLocked(sa *streamAssignment, peer string) bool {
  1809  	if rg := sa.Group; !rg.isMember(peer) {
  1810  		return false
  1811  	}
  1812  
  1813  	s, cc, csa := js.srv, js.cluster, sa.copyGroup()
  1814  	if cc == nil || cc.meta == nil {
  1815  		return false
  1816  	}
  1817  	replaced := cc.remapStreamAssignment(csa, peer)
  1818  	if !replaced {
  1819  		s.Warnf("JetStream cluster could not replace peer for stream '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name)
  1820  	}
  1821  
  1822  	// Send our proposal for this csa. Also use same group definition for all the consumers as well.
  1823  	cc.meta.Propose(encodeAddStreamAssignment(csa))
  1824  	rg := csa.Group
  1825  	for _, ca := range sa.consumers {
  1826  		// Ephemerals are R=1, so only auto-remap durables, or R>1.
  1827  		if ca.Config.Durable != _EMPTY_ {
  1828  			cca := ca.copyGroup()
  1829  			cca.Group.Peers, cca.Group.Preferred = rg.Peers, _EMPTY_
  1830  			cc.meta.Propose(encodeAddConsumerAssignment(cca))
  1831  		} else if ca.Group.isMember(peer) {
  1832  			// These are ephemerals. Check to see if we deleted this peer.
  1833  			cc.meta.Propose(encodeDeleteConsumerAssignment(ca))
  1834  		}
  1835  	}
  1836  	return replaced
  1837  }
  1838  
  1839  // Check if we have peer related entries.
  1840  func (js *jetStream) hasPeerEntries(entries []*Entry) bool {
  1841  	for _, e := range entries {
  1842  		if e.Type == EntryRemovePeer || e.Type == EntryAddPeer {
  1843  			return true
  1844  		}
  1845  	}
  1846  	return false
  1847  }
  1848  
  1849  const ksep = ":"
  1850  
  1851  func (sa *streamAssignment) recoveryKey() string {
  1852  	if sa == nil {
  1853  		return _EMPTY_
  1854  	}
  1855  	return sa.Client.serviceAccount() + ksep + sa.Config.Name
  1856  }
  1857  
  1858  func (ca *consumerAssignment) recoveryKey() string {
  1859  	if ca == nil {
  1860  		return _EMPTY_
  1861  	}
  1862  	return ca.Client.serviceAccount() + ksep + ca.Stream + ksep + ca.Name
  1863  }
  1864  
  1865  func (js *jetStream) applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bool, bool, bool, error) {
  1866  	var didSnap, didRemoveStream, didRemoveConsumer bool
  1867  	isRecovering := js.isMetaRecovering()
  1868  
  1869  	for _, e := range entries {
  1870  		if e.Type == EntrySnapshot {
  1871  			js.applyMetaSnapshot(e.Data, ru, isRecovering)
  1872  			didSnap = true
  1873  		} else if e.Type == EntryRemovePeer {
  1874  			if !isRecovering {
  1875  				js.processRemovePeer(string(e.Data))
  1876  			}
  1877  		} else if e.Type == EntryAddPeer {
  1878  			if !isRecovering {
  1879  				js.processAddPeer(string(e.Data))
  1880  			}
  1881  		} else {
  1882  			buf := e.Data
  1883  			switch entryOp(buf[0]) {
  1884  			case assignStreamOp:
  1885  				sa, err := decodeStreamAssignment(buf[1:])
  1886  				if err != nil {
  1887  					js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:])
  1888  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1889  				}
  1890  				if isRecovering {
  1891  					js.setStreamAssignmentRecovering(sa)
  1892  					delete(ru.removeStreams, sa.recoveryKey())
  1893  				}
  1894  				if js.processStreamAssignment(sa) {
  1895  					didRemoveStream = true
  1896  				}
  1897  			case removeStreamOp:
  1898  				sa, err := decodeStreamAssignment(buf[1:])
  1899  				if err != nil {
  1900  					js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:])
  1901  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1902  				}
  1903  				if isRecovering {
  1904  					js.setStreamAssignmentRecovering(sa)
  1905  					key := sa.recoveryKey()
  1906  					ru.removeStreams[key] = sa
  1907  					delete(ru.updateStreams, key)
  1908  				} else {
  1909  					js.processStreamRemoval(sa)
  1910  					didRemoveStream = true
  1911  				}
  1912  			case assignConsumerOp:
  1913  				ca, err := decodeConsumerAssignment(buf[1:])
  1914  				if err != nil {
  1915  					js.srv.Errorf("JetStream cluster failed to decode consumer assignment: %q", buf[1:])
  1916  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1917  				}
  1918  				if isRecovering {
  1919  					js.setConsumerAssignmentRecovering(ca)
  1920  					key := ca.recoveryKey()
  1921  					delete(ru.removeConsumers, key)
  1922  					ru.updateConsumers[key] = ca
  1923  				} else {
  1924  					js.processConsumerAssignment(ca)
  1925  				}
  1926  			case assignCompressedConsumerOp:
  1927  				ca, err := decodeConsumerAssignmentCompressed(buf[1:])
  1928  				if err != nil {
  1929  					js.srv.Errorf("JetStream cluster failed to decode compressed consumer assignment: %q", buf[1:])
  1930  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1931  				}
  1932  				if isRecovering {
  1933  					js.setConsumerAssignmentRecovering(ca)
  1934  					key := ca.recoveryKey()
  1935  					delete(ru.removeConsumers, key)
  1936  					ru.updateConsumers[key] = ca
  1937  				} else {
  1938  					js.processConsumerAssignment(ca)
  1939  				}
  1940  			case removeConsumerOp:
  1941  				ca, err := decodeConsumerAssignment(buf[1:])
  1942  				if err != nil {
  1943  					js.srv.Errorf("JetStream cluster failed to decode consumer assignment: %q", buf[1:])
  1944  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1945  				}
  1946  				if isRecovering {
  1947  					js.setConsumerAssignmentRecovering(ca)
  1948  					key := ca.recoveryKey()
  1949  					ru.removeConsumers[key] = ca
  1950  					delete(ru.updateConsumers, key)
  1951  				} else {
  1952  					js.processConsumerRemoval(ca)
  1953  					didRemoveConsumer = true
  1954  				}
  1955  			case updateStreamOp:
  1956  				sa, err := decodeStreamAssignment(buf[1:])
  1957  				if err != nil {
  1958  					js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:])
  1959  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1960  				}
  1961  				if isRecovering {
  1962  					js.setStreamAssignmentRecovering(sa)
  1963  					key := sa.recoveryKey()
  1964  					ru.updateStreams[key] = sa
  1965  					delete(ru.removeStreams, key)
  1966  				} else {
  1967  					js.processUpdateStreamAssignment(sa)
  1968  					// Since an update can be lowering replica count, we want upper layer to treat
  1969  					// similar to a removal and snapshot to collapse old entries.
  1970  					didRemoveStream = true
  1971  				}
  1972  			default:
  1973  				panic(fmt.Sprintf("JetStream Cluster Unknown meta entry op type: %v", entryOp(buf[0])))
  1974  			}
  1975  		}
  1976  	}
  1977  	return didSnap, didRemoveStream, didRemoveConsumer, nil
  1978  }
  1979  
  1980  func (rg *raftGroup) isMember(id string) bool {
  1981  	if rg == nil {
  1982  		return false
  1983  	}
  1984  	for _, peer := range rg.Peers {
  1985  		if peer == id {
  1986  			return true
  1987  		}
  1988  	}
  1989  	return false
  1990  }
  1991  
  1992  func (rg *raftGroup) setPreferred() {
  1993  	if rg == nil || len(rg.Peers) == 0 {
  1994  		return
  1995  	}
  1996  	if len(rg.Peers) == 1 {
  1997  		rg.Preferred = rg.Peers[0]
  1998  	} else {
  1999  		// For now just randomly select a peer for the preferred.
  2000  		pi := rand.Int31n(int32(len(rg.Peers)))
  2001  		rg.Preferred = rg.Peers[pi]
  2002  	}
  2003  }
  2004  
  2005  // createRaftGroup is called to spin up this raft group if needed.
  2006  func (js *jetStream) createRaftGroup(accName string, rg *raftGroup, storage StorageType, labels pprofLabels) error {
  2007  	js.mu.Lock()
  2008  	s, cc := js.srv, js.cluster
  2009  	if cc == nil || cc.meta == nil {
  2010  		js.mu.Unlock()
  2011  		return NewJSClusterNotActiveError()
  2012  	}
  2013  
  2014  	// If this is a single peer raft group or we are not a member return.
  2015  	if len(rg.Peers) <= 1 || !rg.isMember(cc.meta.ID()) {
  2016  		js.mu.Unlock()
  2017  		// Nothing to do here.
  2018  		return nil
  2019  	}
  2020  
  2021  	// Check if we already have this assigned.
  2022  	if node := s.lookupRaftNode(rg.Name); node != nil {
  2023  		s.Debugf("JetStream cluster already has raft group %q assigned", rg.Name)
  2024  		rg.node = node
  2025  		js.mu.Unlock()
  2026  		return nil
  2027  	}
  2028  
  2029  	s.Debugf("JetStream cluster creating raft group:%+v", rg)
  2030  	js.mu.Unlock()
  2031  
  2032  	sysAcc := s.SystemAccount()
  2033  	if sysAcc == nil {
  2034  		s.Debugf("JetStream cluster detected shutdown processing raft group: %+v", rg)
  2035  		return errors.New("shutting down")
  2036  	}
  2037  
  2038  	// Check here to see if we have a max HA Assets limit set.
  2039  	if maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets; maxHaAssets > 0 {
  2040  		if s.numRaftNodes() > maxHaAssets {
  2041  			s.Warnf("Maximum HA Assets limit reached: %d", maxHaAssets)
  2042  			// Since the meta leader assigned this, send a statsz update to them to get them up to date.
  2043  			go s.sendStatszUpdate()
  2044  			return errors.New("system limit reached")
  2045  		}
  2046  	}
  2047  
  2048  	storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, rg.Name)
  2049  	var store StreamStore
  2050  	if storage == FileStorage {
  2051  		fs, err := newFileStoreWithCreated(
  2052  			FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMediumBlockSize, AsyncFlush: false, SyncInterval: 5 * time.Minute, srv: s},
  2053  			StreamConfig{Name: rg.Name, Storage: FileStorage, Metadata: labels},
  2054  			time.Now().UTC(),
  2055  			s.jsKeyGen(s.getOpts().JetStreamKey, rg.Name),
  2056  			s.jsKeyGen(s.getOpts().JetStreamOldKey, rg.Name),
  2057  		)
  2058  		if err != nil {
  2059  			s.Errorf("Error creating filestore WAL: %v", err)
  2060  			return err
  2061  		}
  2062  		store = fs
  2063  	} else {
  2064  		ms, err := newMemStore(&StreamConfig{Name: rg.Name, Storage: MemoryStorage})
  2065  		if err != nil {
  2066  			s.Errorf("Error creating memstore WAL: %v", err)
  2067  			return err
  2068  		}
  2069  		store = ms
  2070  	}
  2071  
  2072  	cfg := &RaftConfig{Name: rg.Name, Store: storeDir, Log: store, Track: true}
  2073  
  2074  	if _, err := readPeerState(storeDir); err != nil {
  2075  		s.bootstrapRaftNode(cfg, rg.Peers, true)
  2076  	}
  2077  
  2078  	n, err := s.startRaftNode(accName, cfg, labels)
  2079  	if err != nil || n == nil {
  2080  		s.Debugf("Error creating raft group: %v", err)
  2081  		return err
  2082  	}
  2083  	// Need locking here for the assignment to avoid data-race reports
  2084  	js.mu.Lock()
  2085  	rg.node = n
  2086  	// See if we are preferred and should start campaign immediately.
  2087  	if n.ID() == rg.Preferred && n.Term() == 0 {
  2088  		n.Campaign()
  2089  	}
  2090  	js.mu.Unlock()
  2091  	return nil
  2092  }
  2093  
  2094  func (mset *stream) raftGroup() *raftGroup {
  2095  	if mset == nil {
  2096  		return nil
  2097  	}
  2098  	mset.mu.RLock()
  2099  	defer mset.mu.RUnlock()
  2100  	if mset.sa == nil {
  2101  		return nil
  2102  	}
  2103  	return mset.sa.Group
  2104  }
  2105  
  2106  func (mset *stream) raftNode() RaftNode {
  2107  	if mset == nil {
  2108  		return nil
  2109  	}
  2110  	mset.mu.RLock()
  2111  	defer mset.mu.RUnlock()
  2112  	return mset.node
  2113  }
  2114  
  2115  func (mset *stream) removeNode() {
  2116  	mset.mu.Lock()
  2117  	defer mset.mu.Unlock()
  2118  	if n := mset.node; n != nil {
  2119  		n.Delete()
  2120  		mset.node = nil
  2121  	}
  2122  }
  2123  
  2124  func (mset *stream) clearRaftNode() {
  2125  	if mset == nil {
  2126  		return
  2127  	}
  2128  	mset.mu.Lock()
  2129  	defer mset.mu.Unlock()
  2130  	mset.node = nil
  2131  }
  2132  
  2133  // Helper function to generate peer info.
  2134  // lists and sets for old and new.
  2135  func genPeerInfo(peers []string, split int) (newPeers, oldPeers []string, newPeerSet, oldPeerSet map[string]bool) {
  2136  	newPeers = peers[split:]
  2137  	oldPeers = peers[:split]
  2138  	newPeerSet = make(map[string]bool, len(newPeers))
  2139  	oldPeerSet = make(map[string]bool, len(oldPeers))
  2140  	for i, peer := range peers {
  2141  		if i < split {
  2142  			oldPeerSet[peer] = true
  2143  		} else {
  2144  			newPeerSet[peer] = true
  2145  		}
  2146  	}
  2147  	return
  2148  }
  2149  
  2150  // Monitor our stream node for this stream.
  2151  func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnapshot bool) {
  2152  	s, cc := js.server(), js.cluster
  2153  	defer s.grWG.Done()
  2154  	if mset != nil {
  2155  		defer mset.monitorWg.Done()
  2156  	}
  2157  	js.mu.RLock()
  2158  	n := sa.Group.node
  2159  	meta := cc.meta
  2160  	js.mu.RUnlock()
  2161  
  2162  	if n == nil || meta == nil {
  2163  		s.Warnf("No RAFT group for '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name)
  2164  		return
  2165  	}
  2166  
  2167  	// Make sure only one is running.
  2168  	if mset != nil {
  2169  		if mset.checkInMonitor() {
  2170  			return
  2171  		}
  2172  		defer mset.clearMonitorRunning()
  2173  	}
  2174  
  2175  	// Make sure to stop the raft group on exit to prevent accidental memory bloat.
  2176  	// This should be below the checkInMonitor call though to avoid stopping it out
  2177  	// from underneath the one that is running since it will be the same raft node.
  2178  	defer n.Stop()
  2179  
  2180  	qch, mqch, lch, aq, uch, ourPeerId := n.QuitC(), mset.monitorQuitC(), n.LeadChangeC(), n.ApplyQ(), mset.updateC(), meta.ID()
  2181  
  2182  	s.Debugf("Starting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group())
  2183  	defer s.Debugf("Exiting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group())
  2184  
  2185  	// Make sure we do not leave the apply channel to fill up and block the raft layer.
  2186  	defer func() {
  2187  		if n.State() == Closed {
  2188  			return
  2189  		}
  2190  		if n.Leader() {
  2191  			n.StepDown()
  2192  		}
  2193  		// Drain the commit queue...
  2194  		aq.drain()
  2195  	}()
  2196  
  2197  	const (
  2198  		compactInterval = 2 * time.Minute
  2199  		compactSizeMin  = 8 * 1024 * 1024
  2200  		compactNumMin   = 65536
  2201  		minSnapDelta    = 10 * time.Second
  2202  	)
  2203  
  2204  	// Spread these out for large numbers on server restart.
  2205  	rci := time.Duration(rand.Int63n(int64(time.Minute)))
  2206  	t := time.NewTicker(compactInterval + rci)
  2207  	defer t.Stop()
  2208  
  2209  	js.mu.RLock()
  2210  	isLeader := cc.isStreamLeader(sa.Client.serviceAccount(), sa.Config.Name)
  2211  	isRestore := sa.Restore != nil
  2212  	js.mu.RUnlock()
  2213  
  2214  	acc, err := s.LookupAccount(sa.Client.serviceAccount())
  2215  	if err != nil {
  2216  		s.Warnf("Could not retrieve account for stream '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name)
  2217  		return
  2218  	}
  2219  	accName := acc.GetName()
  2220  
  2221  	// Used to represent how we can detect a changed state quickly and without representing
  2222  	// a complete and detailed state which could be costly in terms of memory, cpu and GC.
  2223  	// This only entails how many messages, and the first and last sequence of the stream.
  2224  	// This is all that is needed to detect a change, and we can get this from FilteredState()
  2225  	// with and empty filter.
  2226  	var lastState SimpleState
  2227  	var lastSnapTime time.Time
  2228  
  2229  	// Don't allow the upper layer to install snapshots until we have
  2230  	// fully recovered from disk.
  2231  	isRecovering := true
  2232  
  2233  	// Should only to be called from leader.
  2234  	doSnapshot := func() {
  2235  		if mset == nil || isRecovering || isRestore || time.Since(lastSnapTime) < minSnapDelta {
  2236  			return
  2237  		}
  2238  
  2239  		// Before we actually calculate the detailed state and encode it, let's check the
  2240  		// simple state to detect any changes.
  2241  		curState := mset.store.FilteredState(0, _EMPTY_)
  2242  
  2243  		// If the state hasn't changed but the log has gone way over
  2244  		// the compaction size then we will want to compact anyway.
  2245  		// This shouldn't happen for streams like it can for pull
  2246  		// consumers on idle streams but better to be safe than sorry!
  2247  		ne, nb := n.Size()
  2248  		if curState == lastState && ne < compactNumMin && nb < compactSizeMin {
  2249  			return
  2250  		}
  2251  
  2252  		if err := n.InstallSnapshot(mset.stateSnapshot()); err == nil {
  2253  			lastState, lastSnapTime = curState, time.Now()
  2254  		} else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning {
  2255  			s.RateLimitWarnf("Failed to install snapshot for '%s > %s' [%s]: %v", mset.acc.Name, mset.name(), n.Group(), err)
  2256  		}
  2257  	}
  2258  
  2259  	// We will establish a restoreDoneCh no matter what. Will never be triggered unless
  2260  	// we replace with the restore chan.
  2261  	restoreDoneCh := make(<-chan error)
  2262  
  2263  	// For migration tracking.
  2264  	var mmt *time.Ticker
  2265  	var mmtc <-chan time.Time
  2266  
  2267  	startMigrationMonitoring := func() {
  2268  		if mmt == nil {
  2269  			mmt = time.NewTicker(500 * time.Millisecond)
  2270  			mmtc = mmt.C
  2271  		}
  2272  	}
  2273  
  2274  	stopMigrationMonitoring := func() {
  2275  		if mmt != nil {
  2276  			mmt.Stop()
  2277  			mmt, mmtc = nil, nil
  2278  		}
  2279  	}
  2280  	defer stopMigrationMonitoring()
  2281  
  2282  	// This is to optionally track when we are ready as a non-leader for direct access participation.
  2283  	// Either direct or if we are a direct mirror, or both.
  2284  	var dat *time.Ticker
  2285  	var datc <-chan time.Time
  2286  
  2287  	startDirectAccessMonitoring := func() {
  2288  		if dat == nil {
  2289  			dat = time.NewTicker(2 * time.Second)
  2290  			datc = dat.C
  2291  		}
  2292  	}
  2293  
  2294  	stopDirectMonitoring := func() {
  2295  		if dat != nil {
  2296  			dat.Stop()
  2297  			dat, datc = nil, nil
  2298  		}
  2299  	}
  2300  	defer stopDirectMonitoring()
  2301  
  2302  	// Check if we are interest based and if so and we have an active stream wait until we
  2303  	// have the consumers attached. This can become important when a server has lots of assets
  2304  	// since we process streams first then consumers as an asset class.
  2305  	if mset != nil && mset.isInterestRetention() {
  2306  		js.mu.RLock()
  2307  		numExpectedConsumers := len(sa.consumers)
  2308  		js.mu.RUnlock()
  2309  		if mset.numConsumers() < numExpectedConsumers {
  2310  			s.Debugf("Waiting for consumers for interest based stream '%s > %s'", accName, mset.name())
  2311  			// Wait up to 10s
  2312  			const maxWaitTime = 10 * time.Second
  2313  			const sleepTime = 250 * time.Millisecond
  2314  			timeout := time.Now().Add(maxWaitTime)
  2315  			for time.Now().Before(timeout) {
  2316  				if mset.numConsumers() >= numExpectedConsumers {
  2317  					break
  2318  				}
  2319  				select {
  2320  				case <-s.quitCh:
  2321  					return
  2322  				case <-time.After(sleepTime):
  2323  				}
  2324  			}
  2325  			if actual := mset.numConsumers(); actual < numExpectedConsumers {
  2326  				s.Warnf("All consumers not online for '%s > %s': expected %d but only have %d", accName, mset.name(), numExpectedConsumers, actual)
  2327  			}
  2328  		}
  2329  	}
  2330  
  2331  	// This is triggered during a scale up from R1 to clustered mode. We need the new followers to catchup,
  2332  	// similar to how we trigger the catchup mechanism post a backup/restore.
  2333  	// We can arrive here NOT being the leader, so we send the snapshot only if we are, and in this case
  2334  	// reset the notion that we need to send the snapshot. If we are not, then the first time the server
  2335  	// will switch to leader (in the loop below), we will send the snapshot.
  2336  	if sendSnapshot && isLeader && mset != nil && n != nil && !isRecovering {
  2337  		n.SendSnapshot(mset.stateSnapshot())
  2338  		sendSnapshot = false
  2339  	}
  2340  
  2341  	for {
  2342  		select {
  2343  		case <-s.quitCh:
  2344  			return
  2345  		case <-mqch:
  2346  			return
  2347  		case <-qch:
  2348  			return
  2349  		case <-aq.ch:
  2350  			var ne, nb uint64
  2351  			ces := aq.pop()
  2352  			for _, ce := range ces {
  2353  				// No special processing needed for when we are caught up on restart.
  2354  				if ce == nil {
  2355  					isRecovering = false
  2356  					// Make sure we create a new snapshot in case things have changed such that any existing
  2357  					// snapshot may no longer be valid.
  2358  					doSnapshot()
  2359  					// If we became leader during this time and we need to send a snapshot to our
  2360  					// followers, i.e. as a result of a scale-up from R1, do it now.
  2361  					if sendSnapshot && isLeader && mset != nil && n != nil {
  2362  						n.SendSnapshot(mset.stateSnapshot())
  2363  						sendSnapshot = false
  2364  					}
  2365  					continue
  2366  				}
  2367  				// Apply our entries.
  2368  				if err := js.applyStreamEntries(mset, ce, isRecovering); err == nil {
  2369  					// Update our applied.
  2370  					ne, nb = n.Applied(ce.Index)
  2371  					ce.ReturnToPool()
  2372  				} else {
  2373  					// Our stream was closed out from underneath of us, simply return here.
  2374  					if err == errStreamClosed {
  2375  						return
  2376  					}
  2377  					s.Warnf("Error applying entries to '%s > %s': %v", accName, sa.Config.Name, err)
  2378  					if isClusterResetErr(err) {
  2379  						if mset.isMirror() && mset.IsLeader() {
  2380  							mset.retryMirrorConsumer()
  2381  							continue
  2382  						}
  2383  						// We will attempt to reset our cluster state.
  2384  						if mset.resetClusteredState(err) {
  2385  							aq.recycle(&ces)
  2386  							return
  2387  						}
  2388  					} else if isOutOfSpaceErr(err) {
  2389  						// If applicable this will tear all of this down, but don't assume so and return.
  2390  						s.handleOutOfSpace(mset)
  2391  					}
  2392  				}
  2393  			}
  2394  			aq.recycle(&ces)
  2395  
  2396  			// Check about snapshotting
  2397  			// If we have at least min entries to compact, go ahead and try to snapshot/compact.
  2398  			if ne >= compactNumMin || nb > compactSizeMin {
  2399  				doSnapshot()
  2400  			}
  2401  
  2402  		case isLeader = <-lch:
  2403  			if isLeader {
  2404  				if mset != nil && n != nil && sendSnapshot && !isRecovering {
  2405  					// If we *are* recovering at the time then this will get done when the apply queue
  2406  					// handles the nil guard to show the catchup ended.
  2407  					n.SendSnapshot(mset.stateSnapshot())
  2408  					sendSnapshot = false
  2409  				}
  2410  				if isRestore {
  2411  					acc, _ := s.LookupAccount(sa.Client.serviceAccount())
  2412  					restoreDoneCh = s.processStreamRestore(sa.Client, acc, sa.Config, _EMPTY_, sa.Reply, _EMPTY_)
  2413  					continue
  2414  				} else if n != nil && n.NeedSnapshot() {
  2415  					doSnapshot()
  2416  				}
  2417  				// Always cancel if this was running.
  2418  				stopDirectMonitoring()
  2419  
  2420  			} else if n.GroupLeader() != noLeader {
  2421  				js.setStreamAssignmentRecovering(sa)
  2422  			}
  2423  
  2424  			// Process our leader change.
  2425  			js.processStreamLeaderChange(mset, isLeader)
  2426  
  2427  			// We may receive a leader change after the stream assignment which would cancel us
  2428  			// monitoring for this closely. So re-assess our state here as well.
  2429  			// Or the old leader is no longer part of the set and transferred leadership
  2430  			// for this leader to resume with removal
  2431  			migrating := mset.isMigrating()
  2432  
  2433  			// Check for migrations here. We set the state on the stream assignment update below.
  2434  			if isLeader && migrating {
  2435  				startMigrationMonitoring()
  2436  			}
  2437  
  2438  			// Here we are checking if we are not the leader but we have been asked to allow
  2439  			// direct access. We now allow non-leaders to participate in the queue group.
  2440  			if !isLeader && mset != nil {
  2441  				mset.mu.RLock()
  2442  				ad, md := mset.cfg.AllowDirect, mset.cfg.MirrorDirect
  2443  				mset.mu.RUnlock()
  2444  				if ad || md {
  2445  					startDirectAccessMonitoring()
  2446  				}
  2447  			}
  2448  
  2449  		case <-datc:
  2450  			if mset == nil || isRecovering {
  2451  				continue
  2452  			}
  2453  			// If we are leader we can stop, we know this is setup now.
  2454  			if isLeader {
  2455  				stopDirectMonitoring()
  2456  				continue
  2457  			}
  2458  
  2459  			mset.mu.Lock()
  2460  			ad, md, current := mset.cfg.AllowDirect, mset.cfg.MirrorDirect, mset.isCurrent()
  2461  			if !current {
  2462  				const syncThreshold = 90.0
  2463  				// We are not current, but current means exactly caught up. Under heavy publish
  2464  				// loads we may never reach this, so check if we are within 90% caught up.
  2465  				_, c, a := mset.node.Progress()
  2466  				if c == 0 {
  2467  					mset.mu.Unlock()
  2468  					continue
  2469  				}
  2470  				if p := float64(a) / float64(c) * 100.0; p < syncThreshold {
  2471  					mset.mu.Unlock()
  2472  					continue
  2473  				} else {
  2474  					s.Debugf("Stream '%s > %s' enabling direct gets at %.0f%% synchronized",
  2475  						sa.Client.serviceAccount(), sa.Config.Name, p)
  2476  				}
  2477  			}
  2478  			// We are current, cancel monitoring and create the direct subs as needed.
  2479  			if ad {
  2480  				mset.subscribeToDirect()
  2481  			}
  2482  			if md {
  2483  				mset.subscribeToMirrorDirect()
  2484  			}
  2485  			mset.mu.Unlock()
  2486  			// Stop direct monitoring.
  2487  			stopDirectMonitoring()
  2488  
  2489  		case <-t.C:
  2490  			doSnapshot()
  2491  
  2492  		case <-uch:
  2493  			// keep stream assignment current
  2494  			sa = mset.streamAssignment()
  2495  
  2496  			// keep peer list up to date with config
  2497  			js.checkPeers(mset.raftGroup())
  2498  			// We get this when we have a new stream assignment caused by an update.
  2499  			// We want to know if we are migrating.
  2500  			if migrating := mset.isMigrating(); migrating {
  2501  				if isLeader && mmtc == nil {
  2502  					startMigrationMonitoring()
  2503  				}
  2504  			} else {
  2505  				stopMigrationMonitoring()
  2506  			}
  2507  		case <-mmtc:
  2508  			if !isLeader {
  2509  				// We are no longer leader, so not our job.
  2510  				stopMigrationMonitoring()
  2511  				continue
  2512  			}
  2513  
  2514  			// Check to see where we are..
  2515  			rg := mset.raftGroup()
  2516  
  2517  			// Track the new peers and check the ones that are current.
  2518  			mset.mu.RLock()
  2519  			replicas := mset.cfg.Replicas
  2520  			mset.mu.RUnlock()
  2521  			if len(rg.Peers) <= replicas {
  2522  				// Migration no longer happening, so not our job anymore
  2523  				stopMigrationMonitoring()
  2524  				continue
  2525  			}
  2526  
  2527  			// Make sure we have correct cluster information on the other peers.
  2528  			ci := js.clusterInfo(rg)
  2529  			mset.checkClusterInfo(ci)
  2530  
  2531  			newPeers, oldPeers, newPeerSet, oldPeerSet := genPeerInfo(rg.Peers, len(rg.Peers)-replicas)
  2532  
  2533  			// If we are part of the new peerset and we have been passed the baton.
  2534  			// We will handle scale down.
  2535  			if newPeerSet[ourPeerId] {
  2536  				// First need to check on any consumers and make sure they have moved properly before scaling down ourselves.
  2537  				js.mu.RLock()
  2538  				var needToWait bool
  2539  				for name, c := range sa.consumers {
  2540  					for _, peer := range c.Group.Peers {
  2541  						// If we have peers still in the old set block.
  2542  						if oldPeerSet[peer] {
  2543  							s.Debugf("Scale down of '%s > %s' blocked by consumer '%s'", accName, sa.Config.Name, name)
  2544  							needToWait = true
  2545  							break
  2546  						}
  2547  					}
  2548  					if needToWait {
  2549  						break
  2550  					}
  2551  				}
  2552  				js.mu.RUnlock()
  2553  				if needToWait {
  2554  					continue
  2555  				}
  2556  
  2557  				// We are good to go, can scale down here.
  2558  				for _, p := range oldPeers {
  2559  					n.ProposeRemovePeer(p)
  2560  				}
  2561  
  2562  				csa := sa.copyGroup()
  2563  				csa.Group.Peers = newPeers
  2564  				csa.Group.Preferred = ourPeerId
  2565  				csa.Group.Cluster = s.cachedClusterName()
  2566  				cc.meta.ForwardProposal(encodeUpdateStreamAssignment(csa))
  2567  				s.Noticef("Scaling down '%s > %s' to %+v", accName, sa.Config.Name, s.peerSetToNames(newPeers))
  2568  			} else {
  2569  				// We are the old leader here, from the original peer set.
  2570  				// We are simply waiting on the new peerset to be caught up so we can transfer leadership.
  2571  				var newLeaderPeer, newLeader string
  2572  				neededCurrent, current := replicas/2+1, 0
  2573  
  2574  				for _, r := range ci.Replicas {
  2575  					if r.Current && newPeerSet[r.Peer] {
  2576  						current++
  2577  						if newLeader == _EMPTY_ {
  2578  							newLeaderPeer, newLeader = r.Peer, r.Name
  2579  						}
  2580  					}
  2581  				}
  2582  				// Check if we have a quorom.
  2583  				if current >= neededCurrent {
  2584  					s.Noticef("Transfer of stream leader for '%s > %s' to '%s'", accName, sa.Config.Name, newLeader)
  2585  					n.UpdateKnownPeers(newPeers)
  2586  					n.StepDown(newLeaderPeer)
  2587  				}
  2588  			}
  2589  
  2590  		case err := <-restoreDoneCh:
  2591  			// We have completed a restore from snapshot on this server. The stream assignment has
  2592  			// already been assigned but the replicas will need to catch up out of band. Consumers
  2593  			// will need to be assigned by forwarding the proposal and stamping the initial state.
  2594  			s.Debugf("Stream restore for '%s > %s' completed", sa.Client.serviceAccount(), sa.Config.Name)
  2595  			if err != nil {
  2596  				s.Debugf("Stream restore failed: %v", err)
  2597  			}
  2598  			isRestore = false
  2599  			sa.Restore = nil
  2600  			// If we were successful lookup up our stream now.
  2601  			if err == nil {
  2602  				if mset, err = acc.lookupStream(sa.Config.Name); mset != nil {
  2603  					mset.monitorWg.Add(1)
  2604  					defer mset.monitorWg.Done()
  2605  					mset.setStreamAssignment(sa)
  2606  					// Make sure to update our updateC which would have been nil.
  2607  					uch = mset.updateC()
  2608  					// Also update our mqch
  2609  					mqch = mset.monitorQuitC()
  2610  				}
  2611  			}
  2612  			if err != nil {
  2613  				if mset != nil {
  2614  					mset.delete()
  2615  				}
  2616  				js.mu.Lock()
  2617  				sa.err = err
  2618  				if n != nil {
  2619  					n.Delete()
  2620  				}
  2621  				result := &streamAssignmentResult{
  2622  					Account: sa.Client.serviceAccount(),
  2623  					Stream:  sa.Config.Name,
  2624  					Restore: &JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}},
  2625  				}
  2626  				result.Restore.Error = NewJSStreamAssignmentError(err, Unless(err))
  2627  				js.mu.Unlock()
  2628  				// Send response to the metadata leader. They will forward to the user as needed.
  2629  				s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result)
  2630  				return
  2631  			}
  2632  
  2633  			if !isLeader {
  2634  				panic("Finished restore but not leader")
  2635  			}
  2636  			// Trigger the stream followers to catchup.
  2637  			if n = mset.raftNode(); n != nil {
  2638  				n.SendSnapshot(mset.stateSnapshot())
  2639  			}
  2640  			js.processStreamLeaderChange(mset, isLeader)
  2641  
  2642  			// Check to see if we have restored consumers here.
  2643  			// These are not currently assigned so we will need to do so here.
  2644  			if consumers := mset.getPublicConsumers(); len(consumers) > 0 {
  2645  				for _, o := range consumers {
  2646  					name, cfg := o.String(), o.config()
  2647  					rg := cc.createGroupForConsumer(&cfg, sa)
  2648  					// Pick a preferred leader.
  2649  					rg.setPreferred()
  2650  
  2651  					// Place our initial state here as well for assignment distribution.
  2652  					state, _ := o.store.State()
  2653  					ca := &consumerAssignment{
  2654  						Group:   rg,
  2655  						Stream:  sa.Config.Name,
  2656  						Name:    name,
  2657  						Config:  &cfg,
  2658  						Client:  sa.Client,
  2659  						Created: o.createdTime(),
  2660  						State:   state,
  2661  					}
  2662  
  2663  					// We make these compressed in case state is complex.
  2664  					addEntry := encodeAddConsumerAssignmentCompressed(ca)
  2665  					cc.meta.ForwardProposal(addEntry)
  2666  
  2667  					// Check to make sure we see the assignment.
  2668  					go func() {
  2669  						ticker := time.NewTicker(time.Second)
  2670  						defer ticker.Stop()
  2671  						for range ticker.C {
  2672  							js.mu.RLock()
  2673  							ca, meta := js.consumerAssignment(ca.Client.serviceAccount(), sa.Config.Name, name), cc.meta
  2674  							js.mu.RUnlock()
  2675  							if ca == nil {
  2676  								s.Warnf("Consumer assignment has not been assigned, retrying")
  2677  								if meta != nil {
  2678  									meta.ForwardProposal(addEntry)
  2679  								} else {
  2680  									return
  2681  								}
  2682  							} else {
  2683  								return
  2684  							}
  2685  						}
  2686  					}()
  2687  				}
  2688  			}
  2689  		}
  2690  	}
  2691  }
  2692  
  2693  // Determine if we are migrating
  2694  func (mset *stream) isMigrating() bool {
  2695  	if mset == nil {
  2696  		return false
  2697  	}
  2698  
  2699  	mset.mu.RLock()
  2700  	js, sa := mset.js, mset.sa
  2701  	mset.mu.RUnlock()
  2702  
  2703  	js.mu.RLock()
  2704  	defer js.mu.RUnlock()
  2705  
  2706  	// During migration we will always be R>1, even when we start R1.
  2707  	// So if we do not have a group or node we no we are not migrating.
  2708  	if sa == nil || sa.Group == nil || sa.Group.node == nil {
  2709  		return false
  2710  	}
  2711  	// The sign of migration is if our group peer count != configured replica count.
  2712  	if sa.Config.Replicas == len(sa.Group.Peers) {
  2713  		return false
  2714  	}
  2715  	return true
  2716  }
  2717  
  2718  // resetClusteredState is called when a clustered stream had an error (e.g sequence mismatch, bad snapshot) and needs to be reset.
  2719  func (mset *stream) resetClusteredState(err error) bool {
  2720  	mset.mu.RLock()
  2721  	s, js, jsa, sa, acc, node := mset.srv, mset.js, mset.jsa, mset.sa, mset.acc, mset.node
  2722  	stype, isLeader, tierName, replicas := mset.cfg.Storage, mset.isLeader(), mset.tier, mset.cfg.Replicas
  2723  	mset.mu.RUnlock()
  2724  
  2725  	// Stepdown regardless if we are the leader here.
  2726  	if isLeader && node != nil {
  2727  		node.StepDown()
  2728  	}
  2729  
  2730  	// If we detect we are shutting down just return.
  2731  	if js != nil && js.isShuttingDown() {
  2732  		s.Debugf("Will not reset stream, jetstream shutting down")
  2733  		return false
  2734  	}
  2735  
  2736  	// Server
  2737  	if js.limitsExceeded(stype) {
  2738  		s.Warnf("Will not reset stream, server resources exceeded")
  2739  		return false
  2740  	}
  2741  
  2742  	// Account
  2743  	if exceeded, _ := jsa.limitsExceeded(stype, tierName, replicas); exceeded {
  2744  		s.Warnf("stream '%s > %s' errored, account resources exceeded", acc, mset.name())
  2745  		return false
  2746  	}
  2747  
  2748  	// We delete our raft state. Will recreate.
  2749  	if node != nil {
  2750  		node.Delete()
  2751  	}
  2752  
  2753  	// Preserve our current state and messages unless we have a first sequence mismatch.
  2754  	shouldDelete := err == errFirstSequenceMismatch
  2755  
  2756  	// Need to do the rest in a separate Go routine.
  2757  	go func() {
  2758  		mset.monitorWg.Wait()
  2759  		mset.resetAndWaitOnConsumers()
  2760  		// Stop our stream.
  2761  		mset.stop(shouldDelete, false)
  2762  
  2763  		if sa != nil {
  2764  			js.mu.Lock()
  2765  			if js.shuttingDown {
  2766  				js.mu.Unlock()
  2767  				return
  2768  			}
  2769  
  2770  			s.Warnf("Resetting stream cluster state for '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name)
  2771  			// Now wipe groups from assignments.
  2772  			sa.Group.node = nil
  2773  			var consumers []*consumerAssignment
  2774  			if cc := js.cluster; cc != nil && cc.meta != nil {
  2775  				ourID := cc.meta.ID()
  2776  				for _, ca := range sa.consumers {
  2777  					if rg := ca.Group; rg != nil && rg.isMember(ourID) {
  2778  						rg.node = nil // Erase group raft/node state.
  2779  						consumers = append(consumers, ca)
  2780  					}
  2781  				}
  2782  			}
  2783  			js.mu.Unlock()
  2784  
  2785  			// This will reset the stream and consumers.
  2786  			// Reset stream.
  2787  			js.processClusterCreateStream(acc, sa)
  2788  			// Reset consumers.
  2789  			for _, ca := range consumers {
  2790  				js.processClusterCreateConsumer(ca, nil, false)
  2791  			}
  2792  		}
  2793  	}()
  2794  
  2795  	return true
  2796  }
  2797  
  2798  func isControlHdr(hdr []byte) bool {
  2799  	return bytes.HasPrefix(hdr, []byte("NATS/1.0 100 "))
  2800  }
  2801  
  2802  // Apply our stream entries.
  2803  func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isRecovering bool) error {
  2804  	for _, e := range ce.Entries {
  2805  		if e.Type == EntryNormal {
  2806  			buf, op := e.Data, entryOp(e.Data[0])
  2807  			switch op {
  2808  			case streamMsgOp, compressedStreamMsgOp:
  2809  				if mset == nil {
  2810  					continue
  2811  				}
  2812  				s := js.srv
  2813  
  2814  				mbuf := buf[1:]
  2815  				if op == compressedStreamMsgOp {
  2816  					var err error
  2817  					mbuf, err = s2.Decode(nil, mbuf)
  2818  					if err != nil {
  2819  						panic(err.Error())
  2820  					}
  2821  				}
  2822  
  2823  				subject, reply, hdr, msg, lseq, ts, err := decodeStreamMsg(mbuf)
  2824  				if err != nil {
  2825  					if node := mset.raftNode(); node != nil {
  2826  						s.Errorf("JetStream cluster could not decode stream msg for '%s > %s' [%s]",
  2827  							mset.account(), mset.name(), node.Group())
  2828  					}
  2829  					panic(err.Error())
  2830  				}
  2831  
  2832  				// Check for flowcontrol here.
  2833  				if len(msg) == 0 && len(hdr) > 0 && reply != _EMPTY_ && isControlHdr(hdr) {
  2834  					if !isRecovering {
  2835  						mset.sendFlowControlReply(reply)
  2836  					}
  2837  					continue
  2838  				}
  2839  
  2840  				// Grab last sequence and CLFS.
  2841  				last, clfs := mset.lastSeqAndCLFS()
  2842  
  2843  				// We can skip if we know this is less than what we already have.
  2844  				if lseq-clfs < last {
  2845  					s.Debugf("Apply stream entries for '%s > %s' skipping message with sequence %d with last of %d",
  2846  						mset.account(), mset.name(), lseq+1-clfs, last)
  2847  
  2848  					mset.mu.Lock()
  2849  					// Check for any preAcks in case we are interest based.
  2850  					mset.clearAllPreAcks(lseq + 1 - mset.clfs)
  2851  					mset.mu.Unlock()
  2852  					continue
  2853  				}
  2854  
  2855  				// Skip by hand here since first msg special case.
  2856  				// Reason is sequence is unsigned and for lseq being 0
  2857  				// the lseq under stream would have to be -1.
  2858  				if lseq == 0 && last != 0 {
  2859  					continue
  2860  				}
  2861  
  2862  				// Messages to be skipped have no subject or timestamp or msg or hdr.
  2863  				if subject == _EMPTY_ && ts == 0 && len(msg) == 0 && len(hdr) == 0 {
  2864  					// Skip and update our lseq.
  2865  					last := mset.store.SkipMsg()
  2866  					mset.setLastSeq(last)
  2867  					mset.clearAllPreAcks(last)
  2868  					continue
  2869  				}
  2870  
  2871  				var mt *msgTrace
  2872  				// If not recovering, see if we find a message trace object for this
  2873  				// sequence. Only the leader that has proposed this entry will have
  2874  				// stored the trace info.
  2875  				if !isRecovering {
  2876  					mt = mset.getAndDeleteMsgTrace(lseq)
  2877  				}
  2878  				// Process the actual message here.
  2879  				err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt)
  2880  
  2881  				// If we have inflight make sure to clear after processing.
  2882  				// TODO(dlc) - technically check on inflight != nil could cause datarace.
  2883  				// But do not want to acquire lock since tracking this will be rare.
  2884  				if mset.inflight != nil {
  2885  					mset.clMu.Lock()
  2886  					delete(mset.inflight, lseq)
  2887  					mset.clMu.Unlock()
  2888  				}
  2889  
  2890  				if err != nil {
  2891  					if err == errLastSeqMismatch {
  2892  						var state StreamState
  2893  						mset.store.FastState(&state)
  2894  						// If we have no msgs and the other side is delivering us a sequence past where we
  2895  						// should be reset. This is possible if the other side has a stale snapshot and no longer
  2896  						// has those messages. So compact and retry to reset.
  2897  						if state.Msgs == 0 {
  2898  							mset.store.Compact(lseq + 1)
  2899  							// Retry
  2900  							err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt)
  2901  						}
  2902  					}
  2903  
  2904  					// Only return in place if we are going to reset our stream or we are out of space, or we are closed.
  2905  					if isClusterResetErr(err) || isOutOfSpaceErr(err) || err == errStreamClosed {
  2906  						return err
  2907  					}
  2908  					s.Debugf("Apply stream entries for '%s > %s' got error processing message: %v",
  2909  						mset.account(), mset.name(), err)
  2910  				}
  2911  
  2912  			case deleteMsgOp:
  2913  				md, err := decodeMsgDelete(buf[1:])
  2914  				if err != nil {
  2915  					if node := mset.raftNode(); node != nil {
  2916  						s := js.srv
  2917  						s.Errorf("JetStream cluster could not decode delete msg for '%s > %s' [%s]",
  2918  							mset.account(), mset.name(), node.Group())
  2919  					}
  2920  					panic(err.Error())
  2921  				}
  2922  				s, cc := js.server(), js.cluster
  2923  
  2924  				var removed bool
  2925  				if md.NoErase {
  2926  					removed, err = mset.removeMsg(md.Seq)
  2927  				} else {
  2928  					removed, err = mset.eraseMsg(md.Seq)
  2929  				}
  2930  
  2931  				// Cluster reset error.
  2932  				if err == ErrStoreEOF {
  2933  					return err
  2934  				}
  2935  
  2936  				if err != nil && !isRecovering {
  2937  					s.Debugf("JetStream cluster failed to delete stream msg %d from '%s > %s': %v",
  2938  						md.Seq, md.Client.serviceAccount(), md.Stream, err)
  2939  				}
  2940  
  2941  				js.mu.RLock()
  2942  				isLeader := cc.isStreamLeader(md.Client.serviceAccount(), md.Stream)
  2943  				js.mu.RUnlock()
  2944  
  2945  				if isLeader && !isRecovering {
  2946  					var resp = JSApiMsgDeleteResponse{ApiResponse: ApiResponse{Type: JSApiMsgDeleteResponseType}}
  2947  					if err != nil {
  2948  						resp.Error = NewJSStreamMsgDeleteFailedError(err, Unless(err))
  2949  						s.sendAPIErrResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp))
  2950  					} else if !removed {
  2951  						resp.Error = NewJSSequenceNotFoundError(md.Seq)
  2952  						s.sendAPIErrResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp))
  2953  					} else {
  2954  						resp.Success = true
  2955  						s.sendAPIResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp))
  2956  					}
  2957  				}
  2958  			case purgeStreamOp:
  2959  				sp, err := decodeStreamPurge(buf[1:])
  2960  				if err != nil {
  2961  					if node := mset.raftNode(); node != nil {
  2962  						s := js.srv
  2963  						s.Errorf("JetStream cluster could not decode purge msg for '%s > %s' [%s]",
  2964  							mset.account(), mset.name(), node.Group())
  2965  					}
  2966  					panic(err.Error())
  2967  				}
  2968  				// If no explicit request, fill in with leader stamped last sequence to protect ourselves on replay during server start.
  2969  				if sp.Request == nil || sp.Request.Sequence == 0 {
  2970  					purgeSeq := sp.LastSeq + 1
  2971  					if sp.Request == nil {
  2972  						sp.Request = &JSApiStreamPurgeRequest{Sequence: purgeSeq}
  2973  					} else if sp.Request.Keep == 0 {
  2974  						sp.Request.Sequence = purgeSeq
  2975  					} else if isRecovering {
  2976  						continue
  2977  					}
  2978  				}
  2979  
  2980  				s := js.server()
  2981  				purged, err := mset.purge(sp.Request)
  2982  				if err != nil {
  2983  					s.Warnf("JetStream cluster failed to purge stream %q for account %q: %v", sp.Stream, sp.Client.serviceAccount(), err)
  2984  				}
  2985  
  2986  				js.mu.RLock()
  2987  				isLeader := js.cluster.isStreamLeader(sp.Client.serviceAccount(), sp.Stream)
  2988  				js.mu.RUnlock()
  2989  
  2990  				if isLeader && !isRecovering {
  2991  					var resp = JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}}
  2992  					if err != nil {
  2993  						resp.Error = NewJSStreamGeneralError(err, Unless(err))
  2994  						s.sendAPIErrResponse(sp.Client, mset.account(), sp.Subject, sp.Reply, _EMPTY_, s.jsonResponse(resp))
  2995  					} else {
  2996  						resp.Purged = purged
  2997  						resp.Success = true
  2998  						s.sendAPIResponse(sp.Client, mset.account(), sp.Subject, sp.Reply, _EMPTY_, s.jsonResponse(resp))
  2999  					}
  3000  				}
  3001  			default:
  3002  				panic(fmt.Sprintf("JetStream Cluster Unknown group entry op type: %v", op))
  3003  			}
  3004  		} else if e.Type == EntrySnapshot {
  3005  			if mset == nil {
  3006  				return nil
  3007  			}
  3008  
  3009  			// Everything operates on new replicated state. Will convert legacy snapshots to this for processing.
  3010  			var ss *StreamReplicatedState
  3011  
  3012  			onBadState := func(err error) {
  3013  				// If we are the leader or recovering, meaning we own the snapshot,
  3014  				// we should stepdown and clear our raft state since our snapshot is bad.
  3015  				if isRecovering || mset.IsLeader() {
  3016  					mset.mu.RLock()
  3017  					s, accName, streamName := mset.srv, mset.acc.GetName(), mset.cfg.Name
  3018  					mset.mu.RUnlock()
  3019  					s.Warnf("Detected bad stream state, resetting '%s > %s'", accName, streamName)
  3020  					mset.resetClusteredState(err)
  3021  				}
  3022  			}
  3023  
  3024  			// Check if we are the new binary encoding.
  3025  			if IsEncodedStreamState(e.Data) {
  3026  				var err error
  3027  				ss, err = DecodeStreamState(e.Data)
  3028  				if err != nil {
  3029  					onBadState(err)
  3030  					return err
  3031  				}
  3032  			} else {
  3033  				var snap streamSnapshot
  3034  				if err := json.Unmarshal(e.Data, &snap); err != nil {
  3035  					onBadState(err)
  3036  					return err
  3037  				}
  3038  				// Convert over to StreamReplicatedState
  3039  				ss = &StreamReplicatedState{
  3040  					Msgs:     snap.Msgs,
  3041  					Bytes:    snap.Bytes,
  3042  					FirstSeq: snap.FirstSeq,
  3043  					LastSeq:  snap.LastSeq,
  3044  					Failed:   snap.Failed,
  3045  				}
  3046  				if len(snap.Deleted) > 0 {
  3047  					ss.Deleted = append(ss.Deleted, DeleteSlice(snap.Deleted))
  3048  				}
  3049  			}
  3050  
  3051  			if !isRecovering && !mset.IsLeader() {
  3052  				if err := mset.processSnapshot(ss); err != nil {
  3053  					return err
  3054  				}
  3055  			} else if isRecovering {
  3056  				// On recovery, reset CLFS/FAILED.
  3057  				mset.setCLFS(ss.Failed)
  3058  			}
  3059  		} else if e.Type == EntryRemovePeer {
  3060  			js.mu.RLock()
  3061  			var ourID string
  3062  			if js.cluster != nil && js.cluster.meta != nil {
  3063  				ourID = js.cluster.meta.ID()
  3064  			}
  3065  			js.mu.RUnlock()
  3066  			// We only need to do processing if this is us.
  3067  			if peer := string(e.Data); peer == ourID && mset != nil {
  3068  				// Double check here with the registered stream assignment.
  3069  				shouldRemove := true
  3070  				if sa := mset.streamAssignment(); sa != nil && sa.Group != nil {
  3071  					js.mu.RLock()
  3072  					shouldRemove = !sa.Group.isMember(ourID)
  3073  					js.mu.RUnlock()
  3074  				}
  3075  				if shouldRemove {
  3076  					mset.stop(true, false)
  3077  				}
  3078  			}
  3079  			return nil
  3080  		}
  3081  	}
  3082  	return nil
  3083  }
  3084  
  3085  // Returns the PeerInfo for all replicas of a raft node. This is different than node.Peers()
  3086  // and is used for external facing advisories.
  3087  func (s *Server) replicas(node RaftNode) []*PeerInfo {
  3088  	now := time.Now()
  3089  	var replicas []*PeerInfo
  3090  	for _, rp := range node.Peers() {
  3091  		if sir, ok := s.nodeToInfo.Load(rp.ID); ok && sir != nil {
  3092  			si := sir.(nodeInfo)
  3093  			pi := &PeerInfo{Peer: rp.ID, Name: si.name, Current: rp.Current, Active: now.Sub(rp.Last), Offline: si.offline, Lag: rp.Lag}
  3094  			replicas = append(replicas, pi)
  3095  		}
  3096  	}
  3097  	return replicas
  3098  }
  3099  
  3100  // Will check our node peers and see if we should remove a peer.
  3101  func (js *jetStream) checkPeers(rg *raftGroup) {
  3102  	js.mu.Lock()
  3103  	defer js.mu.Unlock()
  3104  
  3105  	// FIXME(dlc) - Single replicas?
  3106  	if rg == nil || rg.node == nil {
  3107  		return
  3108  	}
  3109  	for _, peer := range rg.node.Peers() {
  3110  		if !rg.isMember(peer.ID) {
  3111  			rg.node.ProposeRemovePeer(peer.ID)
  3112  		}
  3113  	}
  3114  }
  3115  
  3116  // Process a leader change for the clustered stream.
  3117  func (js *jetStream) processStreamLeaderChange(mset *stream, isLeader bool) {
  3118  	if mset == nil {
  3119  		return
  3120  	}
  3121  	sa := mset.streamAssignment()
  3122  	if sa == nil {
  3123  		return
  3124  	}
  3125  
  3126  	// Clear inflight if we have it.
  3127  	mset.clMu.Lock()
  3128  	mset.inflight = nil
  3129  	mset.clMu.Unlock()
  3130  
  3131  	js.mu.Lock()
  3132  	s, account, err := js.srv, sa.Client.serviceAccount(), sa.err
  3133  	client, subject, reply := sa.Client, sa.Subject, sa.Reply
  3134  	hasResponded := sa.responded
  3135  	sa.responded = true
  3136  	peers := copyStrings(sa.Group.Peers)
  3137  	js.mu.Unlock()
  3138  
  3139  	streamName := mset.name()
  3140  
  3141  	if isLeader {
  3142  		s.Noticef("JetStream cluster new stream leader for '%s > %s'", account, streamName)
  3143  		s.sendStreamLeaderElectAdvisory(mset)
  3144  		// Check for peer removal and process here if needed.
  3145  		js.checkPeers(sa.Group)
  3146  		mset.checkAllowMsgCompress(peers)
  3147  	} else {
  3148  		// We are stepping down.
  3149  		// Make sure if we are doing so because we have lost quorum that we send the appropriate advisories.
  3150  		if node := mset.raftNode(); node != nil && !node.Quorum() && time.Since(node.Created()) > 5*time.Second {
  3151  			s.sendStreamLostQuorumAdvisory(mset)
  3152  		}
  3153  	}
  3154  
  3155  	// Tell stream to switch leader status.
  3156  	mset.setLeader(isLeader)
  3157  
  3158  	if !isLeader || hasResponded {
  3159  		return
  3160  	}
  3161  
  3162  	acc, _ := s.LookupAccount(account)
  3163  	if acc == nil {
  3164  		return
  3165  	}
  3166  
  3167  	// Send our response.
  3168  	var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}
  3169  	if err != nil {
  3170  		resp.Error = NewJSStreamCreateError(err, Unless(err))
  3171  		s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  3172  	} else {
  3173  		resp.StreamInfo = &StreamInfo{
  3174  			Created:   mset.createdTime(),
  3175  			State:     mset.state(),
  3176  			Config:    mset.config(),
  3177  			Cluster:   js.clusterInfo(mset.raftGroup()),
  3178  			Sources:   mset.sourcesInfo(),
  3179  			Mirror:    mset.mirrorInfo(),
  3180  			TimeStamp: time.Now().UTC(),
  3181  		}
  3182  		resp.DidCreate = true
  3183  		s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  3184  		if node := mset.raftNode(); node != nil {
  3185  			mset.sendCreateAdvisory()
  3186  		}
  3187  	}
  3188  }
  3189  
  3190  // Fixed value ok for now.
  3191  const lostQuorumAdvInterval = 10 * time.Second
  3192  
  3193  // Determines if we should send lost quorum advisory. We throttle these after first one.
  3194  func (mset *stream) shouldSendLostQuorum() bool {
  3195  	mset.mu.Lock()
  3196  	defer mset.mu.Unlock()
  3197  	if time.Since(mset.lqsent) >= lostQuorumAdvInterval {
  3198  		mset.lqsent = time.Now()
  3199  		return true
  3200  	}
  3201  	return false
  3202  }
  3203  
  3204  func (s *Server) sendStreamLostQuorumAdvisory(mset *stream) {
  3205  	if mset == nil {
  3206  		return
  3207  	}
  3208  	node, stream, acc := mset.raftNode(), mset.name(), mset.account()
  3209  	if node == nil {
  3210  		return
  3211  	}
  3212  	if !mset.shouldSendLostQuorum() {
  3213  		return
  3214  	}
  3215  
  3216  	s.Warnf("JetStream cluster stream '%s > %s' has NO quorum, stalled", acc.GetName(), stream)
  3217  
  3218  	subj := JSAdvisoryStreamQuorumLostPre + "." + stream
  3219  	adv := &JSStreamQuorumLostAdvisory{
  3220  		TypedEvent: TypedEvent{
  3221  			Type: JSStreamQuorumLostAdvisoryType,
  3222  			ID:   nuid.Next(),
  3223  			Time: time.Now().UTC(),
  3224  		},
  3225  		Stream:   stream,
  3226  		Replicas: s.replicas(node),
  3227  		Domain:   s.getOpts().JetStreamDomain,
  3228  	}
  3229  
  3230  	// Send to the user's account if not the system account.
  3231  	if acc != s.SystemAccount() {
  3232  		s.publishAdvisory(acc, subj, adv)
  3233  	}
  3234  	// Now do system level one. Place account info in adv, and nil account means system.
  3235  	adv.Account = acc.GetName()
  3236  	s.publishAdvisory(nil, subj, adv)
  3237  }
  3238  
  3239  func (s *Server) sendStreamLeaderElectAdvisory(mset *stream) {
  3240  	if mset == nil {
  3241  		return
  3242  	}
  3243  	node, stream, acc := mset.raftNode(), mset.name(), mset.account()
  3244  	if node == nil {
  3245  		return
  3246  	}
  3247  	subj := JSAdvisoryStreamLeaderElectedPre + "." + stream
  3248  	adv := &JSStreamLeaderElectedAdvisory{
  3249  		TypedEvent: TypedEvent{
  3250  			Type: JSStreamLeaderElectedAdvisoryType,
  3251  			ID:   nuid.Next(),
  3252  			Time: time.Now().UTC(),
  3253  		},
  3254  		Stream:   stream,
  3255  		Leader:   s.serverNameForNode(node.GroupLeader()),
  3256  		Replicas: s.replicas(node),
  3257  		Domain:   s.getOpts().JetStreamDomain,
  3258  	}
  3259  
  3260  	// Send to the user's account if not the system account.
  3261  	if acc != s.SystemAccount() {
  3262  		s.publishAdvisory(acc, subj, adv)
  3263  	}
  3264  	// Now do system level one. Place account info in adv, and nil account means system.
  3265  	adv.Account = acc.GetName()
  3266  	s.publishAdvisory(nil, subj, adv)
  3267  }
  3268  
  3269  // Will lookup a stream assignment.
  3270  // Lock should be held.
  3271  func (js *jetStream) streamAssignment(account, stream string) (sa *streamAssignment) {
  3272  	cc := js.cluster
  3273  	if cc == nil {
  3274  		return nil
  3275  	}
  3276  
  3277  	if as := cc.streams[account]; as != nil {
  3278  		sa = as[stream]
  3279  	}
  3280  	return sa
  3281  }
  3282  
  3283  // processStreamAssignment is called when followers have replicated an assignment.
  3284  func (js *jetStream) processStreamAssignment(sa *streamAssignment) bool {
  3285  	js.mu.Lock()
  3286  	s, cc := js.srv, js.cluster
  3287  	accName, stream := sa.Client.serviceAccount(), sa.Config.Name
  3288  	noMeta := cc == nil || cc.meta == nil
  3289  	var ourID string
  3290  	if !noMeta {
  3291  		ourID = cc.meta.ID()
  3292  	}
  3293  	var isMember bool
  3294  	if sa.Group != nil && ourID != _EMPTY_ {
  3295  		isMember = sa.Group.isMember(ourID)
  3296  	}
  3297  
  3298  	// Remove this stream from the inflight proposals
  3299  	cc.removeInflightProposal(accName, sa.Config.Name)
  3300  
  3301  	if s == nil || noMeta {
  3302  		js.mu.Unlock()
  3303  		return false
  3304  	}
  3305  
  3306  	accStreams := cc.streams[accName]
  3307  	if accStreams == nil {
  3308  		accStreams = make(map[string]*streamAssignment)
  3309  	} else if osa := accStreams[stream]; osa != nil && osa != sa {
  3310  		// Copy over private existing state from former SA.
  3311  		if sa.Group != nil {
  3312  			sa.Group.node = osa.Group.node
  3313  		}
  3314  		sa.consumers = osa.consumers
  3315  		sa.responded = osa.responded
  3316  		sa.err = osa.err
  3317  	}
  3318  
  3319  	// Update our state.
  3320  	accStreams[stream] = sa
  3321  	cc.streams[accName] = accStreams
  3322  	hasResponded := sa.responded
  3323  	js.mu.Unlock()
  3324  
  3325  	acc, err := s.LookupAccount(accName)
  3326  	if err != nil {
  3327  		ll := fmt.Sprintf("Account [%s] lookup for stream create failed: %v", accName, err)
  3328  		if isMember {
  3329  			if !hasResponded {
  3330  				// If we can not lookup the account and we are a member, send this result back to the metacontroller leader.
  3331  				result := &streamAssignmentResult{
  3332  					Account:  accName,
  3333  					Stream:   stream,
  3334  					Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}},
  3335  				}
  3336  				result.Response.Error = NewJSNoAccountError()
  3337  				s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result)
  3338  			}
  3339  			s.Warnf(ll)
  3340  		} else {
  3341  			s.Debugf(ll)
  3342  		}
  3343  		return false
  3344  	}
  3345  
  3346  	var didRemove bool
  3347  
  3348  	// Check if this is for us..
  3349  	if isMember {
  3350  		js.processClusterCreateStream(acc, sa)
  3351  	} else if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil {
  3352  		// We have one here even though we are not a member. This can happen on re-assignment.
  3353  		s.removeStream(ourID, mset, sa)
  3354  	}
  3355  
  3356  	// If this stream assignment does not have a sync subject (bug) set that the meta-leader should check when elected.
  3357  	if sa.Sync == _EMPTY_ {
  3358  		js.mu.Lock()
  3359  		cc.streamsCheck = true
  3360  		js.mu.Unlock()
  3361  		return false
  3362  	}
  3363  
  3364  	return didRemove
  3365  }
  3366  
  3367  // processUpdateStreamAssignment is called when followers have replicated an updated assignment.
  3368  func (js *jetStream) processUpdateStreamAssignment(sa *streamAssignment) {
  3369  	js.mu.RLock()
  3370  	s, cc := js.srv, js.cluster
  3371  	js.mu.RUnlock()
  3372  	if s == nil || cc == nil {
  3373  		// TODO(dlc) - debug at least
  3374  		return
  3375  	}
  3376  
  3377  	accName := sa.Client.serviceAccount()
  3378  	stream := sa.Config.Name
  3379  
  3380  	js.mu.Lock()
  3381  	if cc.meta == nil {
  3382  		js.mu.Unlock()
  3383  		return
  3384  	}
  3385  	ourID := cc.meta.ID()
  3386  
  3387  	var isMember bool
  3388  	if sa.Group != nil {
  3389  		isMember = sa.Group.isMember(ourID)
  3390  	}
  3391  
  3392  	accStreams := cc.streams[accName]
  3393  	if accStreams == nil {
  3394  		js.mu.Unlock()
  3395  		return
  3396  	}
  3397  	osa := accStreams[stream]
  3398  	if osa == nil {
  3399  		js.mu.Unlock()
  3400  		return
  3401  	}
  3402  
  3403  	// Copy over private existing state from former SA.
  3404  	if sa.Group != nil {
  3405  		sa.Group.node = osa.Group.node
  3406  	}
  3407  	sa.consumers = osa.consumers
  3408  	sa.err = osa.err
  3409  
  3410  	// If we detect we are scaling down to 1, non-clustered, and we had a previous node, clear it here.
  3411  	if sa.Config.Replicas == 1 && sa.Group.node != nil {
  3412  		sa.Group.node = nil
  3413  	}
  3414  
  3415  	// Update our state.
  3416  	accStreams[stream] = sa
  3417  	cc.streams[accName] = accStreams
  3418  
  3419  	// Make sure we respond if we are a member.
  3420  	if isMember {
  3421  		sa.responded = false
  3422  	} else {
  3423  		// Make sure to clean up any old node in case this stream moves back here.
  3424  		if sa.Group != nil {
  3425  			sa.Group.node = nil
  3426  		}
  3427  	}
  3428  	js.mu.Unlock()
  3429  
  3430  	acc, err := s.LookupAccount(accName)
  3431  	if err != nil {
  3432  		s.Warnf("Update Stream Account %s, error on lookup: %v", accName, err)
  3433  		return
  3434  	}
  3435  
  3436  	// Check if this is for us..
  3437  	if isMember {
  3438  		js.processClusterUpdateStream(acc, osa, sa)
  3439  	} else if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil {
  3440  		// We have one here even though we are not a member. This can happen on re-assignment.
  3441  		s.removeStream(ourID, mset, sa)
  3442  	}
  3443  }
  3444  
  3445  // Common function to remove ourself from this server.
  3446  // This can happen on re-assignment, move, etc
  3447  func (s *Server) removeStream(ourID string, mset *stream, nsa *streamAssignment) {
  3448  	if mset == nil {
  3449  		return
  3450  	}
  3451  	// Make sure to use the new stream assignment, not our own.
  3452  	s.Debugf("JetStream removing stream '%s > %s' from this server", nsa.Client.serviceAccount(), nsa.Config.Name)
  3453  	if node := mset.raftNode(); node != nil {
  3454  		if node.Leader() {
  3455  			node.StepDown(nsa.Group.Preferred)
  3456  		}
  3457  		node.ProposeRemovePeer(ourID)
  3458  		// shutdown monitor by shutting down raft.
  3459  		node.Delete()
  3460  	}
  3461  
  3462  	var isShuttingDown bool
  3463  	// Make sure this node is no longer attached to our stream assignment.
  3464  	if js, _ := s.getJetStreamCluster(); js != nil {
  3465  		js.mu.Lock()
  3466  		nsa.Group.node = nil
  3467  		isShuttingDown = js.shuttingDown
  3468  		js.mu.Unlock()
  3469  	}
  3470  
  3471  	if !isShuttingDown {
  3472  		// wait for monitor to be shutdown.
  3473  		mset.monitorWg.Wait()
  3474  	}
  3475  	mset.stop(true, false)
  3476  }
  3477  
  3478  // processClusterUpdateStream is called when we have a stream assignment that
  3479  // has been updated for an existing assignment and we are a member.
  3480  func (js *jetStream) processClusterUpdateStream(acc *Account, osa, sa *streamAssignment) {
  3481  	if sa == nil {
  3482  		return
  3483  	}
  3484  
  3485  	js.mu.Lock()
  3486  	s, rg := js.srv, sa.Group
  3487  	client, subject, reply := sa.Client, sa.Subject, sa.Reply
  3488  	alreadyRunning, numReplicas := osa.Group.node != nil, len(rg.Peers)
  3489  	needsNode := rg.node == nil
  3490  	storage, cfg := sa.Config.Storage, sa.Config
  3491  	hasResponded := sa.responded
  3492  	sa.responded = true
  3493  	recovering := sa.recovering
  3494  	js.mu.Unlock()
  3495  
  3496  	mset, err := acc.lookupStream(cfg.Name)
  3497  	if err == nil && mset != nil {
  3498  		// Make sure we have not had a new group assigned to us.
  3499  		if osa.Group.Name != sa.Group.Name {
  3500  			s.Warnf("JetStream cluster detected stream remapping for '%s > %s' from %q to %q",
  3501  				acc, cfg.Name, osa.Group.Name, sa.Group.Name)
  3502  			mset.removeNode()
  3503  			alreadyRunning, needsNode = false, true
  3504  			// Make sure to clear from original.
  3505  			js.mu.Lock()
  3506  			osa.Group.node = nil
  3507  			js.mu.Unlock()
  3508  		}
  3509  
  3510  		var needsSetLeader bool
  3511  		if !alreadyRunning && numReplicas > 1 {
  3512  			if needsNode {
  3513  				mset.setLeader(false)
  3514  				js.createRaftGroup(acc.GetName(), rg, storage, pprofLabels{
  3515  					"type":    "stream",
  3516  					"account": mset.accName(),
  3517  					"stream":  mset.name(),
  3518  				})
  3519  			}
  3520  			mset.monitorWg.Add(1)
  3521  			// Start monitoring..
  3522  			s.startGoRoutine(
  3523  				func() { js.monitorStream(mset, sa, needsNode) },
  3524  				pprofLabels{
  3525  					"type":    "stream",
  3526  					"account": mset.accName(),
  3527  					"stream":  mset.name(),
  3528  				},
  3529  			)
  3530  		} else if numReplicas == 1 && alreadyRunning {
  3531  			// We downgraded to R1. Make sure we cleanup the raft node and the stream monitor.
  3532  			mset.removeNode()
  3533  			// Make sure we are leader now that we are R1.
  3534  			needsSetLeader = true
  3535  			// In case we need to shutdown the cluster specific subs, etc.
  3536  			mset.setLeader(false)
  3537  			js.mu.Lock()
  3538  			rg.node = nil
  3539  			js.mu.Unlock()
  3540  		}
  3541  		// Call update.
  3542  		if err = mset.updateWithAdvisory(cfg, !recovering); err != nil {
  3543  			s.Warnf("JetStream cluster error updating stream %q for account %q: %v", cfg.Name, acc.Name, err)
  3544  		}
  3545  		// Set the new stream assignment.
  3546  		mset.setStreamAssignment(sa)
  3547  		// Make sure we are the leader now that we are R1.
  3548  		if needsSetLeader {
  3549  			mset.setLeader(true)
  3550  		}
  3551  	}
  3552  
  3553  	// If not found we must be expanding into this node since if we are here we know we are a member.
  3554  	if err == ErrJetStreamStreamNotFound {
  3555  		js.processStreamAssignment(sa)
  3556  		return
  3557  	}
  3558  
  3559  	if err != nil {
  3560  		js.mu.Lock()
  3561  		sa.err = err
  3562  		result := &streamAssignmentResult{
  3563  			Account:  sa.Client.serviceAccount(),
  3564  			Stream:   sa.Config.Name,
  3565  			Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}},
  3566  			Update:   true,
  3567  		}
  3568  		result.Response.Error = NewJSStreamGeneralError(err, Unless(err))
  3569  		js.mu.Unlock()
  3570  
  3571  		// Send response to the metadata leader. They will forward to the user as needed.
  3572  		s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result)
  3573  		return
  3574  	}
  3575  
  3576  	isLeader := mset.IsLeader()
  3577  
  3578  	// Check for missing syncSubject bug.
  3579  	if isLeader && osa != nil && osa.Sync == _EMPTY_ {
  3580  		if node := mset.raftNode(); node != nil {
  3581  			node.StepDown()
  3582  		}
  3583  		return
  3584  	}
  3585  
  3586  	// If we were a single node being promoted assume leadership role for purpose of responding.
  3587  	if !hasResponded && !isLeader && !alreadyRunning {
  3588  		isLeader = true
  3589  	}
  3590  
  3591  	// Check if we should bail.
  3592  	if !isLeader || hasResponded || recovering {
  3593  		return
  3594  	}
  3595  
  3596  	// Send our response.
  3597  	var resp = JSApiStreamUpdateResponse{ApiResponse: ApiResponse{Type: JSApiStreamUpdateResponseType}}
  3598  	resp.StreamInfo = &StreamInfo{
  3599  		Created:   mset.createdTime(),
  3600  		State:     mset.state(),
  3601  		Config:    mset.config(),
  3602  		Cluster:   js.clusterInfo(mset.raftGroup()),
  3603  		Mirror:    mset.mirrorInfo(),
  3604  		Sources:   mset.sourcesInfo(),
  3605  		TimeStamp: time.Now().UTC(),
  3606  	}
  3607  
  3608  	s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  3609  }
  3610  
  3611  // processClusterCreateStream is called when we have a stream assignment that
  3612  // has been committed and this server is a member of the peer group.
  3613  func (js *jetStream) processClusterCreateStream(acc *Account, sa *streamAssignment) {
  3614  	if sa == nil {
  3615  		return
  3616  	}
  3617  
  3618  	js.mu.RLock()
  3619  	s, rg := js.srv, sa.Group
  3620  	alreadyRunning := rg.node != nil
  3621  	storage := sa.Config.Storage
  3622  	restore := sa.Restore
  3623  	js.mu.RUnlock()
  3624  
  3625  	// Process the raft group and make sure it's running if needed.
  3626  	err := js.createRaftGroup(acc.GetName(), rg, storage, pprofLabels{
  3627  		"type":    "stream",
  3628  		"account": acc.Name,
  3629  		"stream":  sa.Config.Name,
  3630  	})
  3631  
  3632  	// If we are restoring, create the stream if we are R>1 and not the preferred who handles the
  3633  	// receipt of the snapshot itself.
  3634  	shouldCreate := true
  3635  	if restore != nil {
  3636  		if len(rg.Peers) == 1 || rg.node != nil && rg.node.ID() == rg.Preferred {
  3637  			shouldCreate = false
  3638  		} else {
  3639  			js.mu.Lock()
  3640  			sa.Restore = nil
  3641  			js.mu.Unlock()
  3642  		}
  3643  	}
  3644  
  3645  	// Our stream.
  3646  	var mset *stream
  3647  
  3648  	// Process here if not restoring or not the leader.
  3649  	if shouldCreate && err == nil {
  3650  		// Go ahead and create or update the stream.
  3651  		mset, err = acc.lookupStream(sa.Config.Name)
  3652  		if err == nil && mset != nil {
  3653  			osa := mset.streamAssignment()
  3654  			// If we already have a stream assignment and they are the same exact config, short circuit here.
  3655  			if osa != nil {
  3656  				if reflect.DeepEqual(osa.Config, sa.Config) {
  3657  					if sa.Group.Name == osa.Group.Name && reflect.DeepEqual(sa.Group.Peers, osa.Group.Peers) {
  3658  						// Since this already exists we know it succeeded, just respond to this caller.
  3659  						js.mu.RLock()
  3660  						client, subject, reply, recovering := sa.Client, sa.Subject, sa.Reply, sa.recovering
  3661  						js.mu.RUnlock()
  3662  
  3663  						if !recovering {
  3664  							var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}
  3665  							resp.StreamInfo = &StreamInfo{
  3666  								Created:   mset.createdTime(),
  3667  								State:     mset.state(),
  3668  								Config:    mset.config(),
  3669  								Cluster:   js.clusterInfo(mset.raftGroup()),
  3670  								Sources:   mset.sourcesInfo(),
  3671  								Mirror:    mset.mirrorInfo(),
  3672  								TimeStamp: time.Now().UTC(),
  3673  							}
  3674  							s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  3675  						}
  3676  						return
  3677  					} else {
  3678  						// We had a bug where we could have multiple assignments for the same
  3679  						// stream but with different group assignments, including multiple raft
  3680  						// groups. So check for that here. We can only bet on the last one being
  3681  						// consistent in the long run, so let it continue if we see this condition.
  3682  						s.Warnf("JetStream cluster detected duplicate assignment for stream %q for account %q", sa.Config.Name, acc.Name)
  3683  						if osa.Group.node != nil && osa.Group.node != sa.Group.node {
  3684  							osa.Group.node.Delete()
  3685  							osa.Group.node = nil
  3686  						}
  3687  					}
  3688  				}
  3689  			}
  3690  			mset.setStreamAssignment(sa)
  3691  			// Check if our config has really been updated.
  3692  			if !reflect.DeepEqual(mset.config(), sa.Config) {
  3693  				if err = mset.updateWithAdvisory(sa.Config, false); err != nil {
  3694  					s.Warnf("JetStream cluster error updating stream %q for account %q: %v", sa.Config.Name, acc.Name, err)
  3695  					if osa != nil {
  3696  						// Process the raft group and make sure it's running if needed.
  3697  						js.createRaftGroup(acc.GetName(), osa.Group, storage, pprofLabels{
  3698  							"type":    "stream",
  3699  							"account": mset.accName(),
  3700  							"stream":  mset.name(),
  3701  						})
  3702  						mset.setStreamAssignment(osa)
  3703  					}
  3704  					if rg.node != nil {
  3705  						rg.node.Delete()
  3706  						rg.node = nil
  3707  					}
  3708  				}
  3709  			}
  3710  		} else if err == NewJSStreamNotFoundError() {
  3711  			// Add in the stream here.
  3712  			mset, err = acc.addStreamWithAssignment(sa.Config, nil, sa)
  3713  		}
  3714  		if mset != nil {
  3715  			mset.setCreatedTime(sa.Created)
  3716  		}
  3717  	}
  3718  
  3719  	// This is an error condition.
  3720  	if err != nil {
  3721  		if IsNatsErr(err, JSStreamStoreFailedF) {
  3722  			s.Warnf("Stream create failed for '%s > %s': %v", sa.Client.serviceAccount(), sa.Config.Name, err)
  3723  			err = errStreamStoreFailed
  3724  		}
  3725  		js.mu.Lock()
  3726  
  3727  		sa.err = err
  3728  		hasResponded := sa.responded
  3729  
  3730  		// If out of space do nothing for now.
  3731  		if isOutOfSpaceErr(err) {
  3732  			hasResponded = true
  3733  		}
  3734  
  3735  		if rg.node != nil {
  3736  			rg.node.Delete()
  3737  		}
  3738  
  3739  		var result *streamAssignmentResult
  3740  		if !hasResponded {
  3741  			result = &streamAssignmentResult{
  3742  				Account:  sa.Client.serviceAccount(),
  3743  				Stream:   sa.Config.Name,
  3744  				Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}},
  3745  			}
  3746  			result.Response.Error = NewJSStreamCreateError(err, Unless(err))
  3747  		}
  3748  		js.mu.Unlock()
  3749  
  3750  		// Send response to the metadata leader. They will forward to the user as needed.
  3751  		if result != nil {
  3752  			s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result)
  3753  		}
  3754  		return
  3755  	}
  3756  
  3757  	// Re-capture node.
  3758  	js.mu.RLock()
  3759  	node := rg.node
  3760  	js.mu.RUnlock()
  3761  
  3762  	// Start our monitoring routine.
  3763  	if node != nil {
  3764  		if !alreadyRunning {
  3765  			if mset != nil {
  3766  				mset.monitorWg.Add(1)
  3767  			}
  3768  			s.startGoRoutine(
  3769  				func() { js.monitorStream(mset, sa, false) },
  3770  				pprofLabels{
  3771  					"type":    "stream",
  3772  					"account": mset.accName(),
  3773  					"stream":  mset.name(),
  3774  				},
  3775  			)
  3776  		}
  3777  	} else {
  3778  		// Single replica stream, process manually here.
  3779  		// If we are restoring, process that first.
  3780  		if sa.Restore != nil {
  3781  			// We are restoring a stream here.
  3782  			restoreDoneCh := s.processStreamRestore(sa.Client, acc, sa.Config, _EMPTY_, sa.Reply, _EMPTY_)
  3783  			s.startGoRoutine(func() {
  3784  				defer s.grWG.Done()
  3785  				select {
  3786  				case err := <-restoreDoneCh:
  3787  					if err == nil {
  3788  						mset, err = acc.lookupStream(sa.Config.Name)
  3789  						if mset != nil {
  3790  							mset.setStreamAssignment(sa)
  3791  							mset.setCreatedTime(sa.Created)
  3792  						}
  3793  					}
  3794  					if err != nil {
  3795  						if mset != nil {
  3796  							mset.delete()
  3797  						}
  3798  						js.mu.Lock()
  3799  						sa.err = err
  3800  						result := &streamAssignmentResult{
  3801  							Account: sa.Client.serviceAccount(),
  3802  							Stream:  sa.Config.Name,
  3803  							Restore: &JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}},
  3804  						}
  3805  						result.Restore.Error = NewJSStreamRestoreError(err, Unless(err))
  3806  						js.mu.Unlock()
  3807  						// Send response to the metadata leader. They will forward to the user as needed.
  3808  						b, _ := json.Marshal(result) // Avoids auto-processing and doing fancy json with newlines.
  3809  						s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, b)
  3810  						return
  3811  					}
  3812  					js.processStreamLeaderChange(mset, true)
  3813  
  3814  					// Check to see if we have restored consumers here.
  3815  					// These are not currently assigned so we will need to do so here.
  3816  					if consumers := mset.getPublicConsumers(); len(consumers) > 0 {
  3817  						js.mu.RLock()
  3818  						cc := js.cluster
  3819  						js.mu.RUnlock()
  3820  
  3821  						for _, o := range consumers {
  3822  							name, cfg := o.String(), o.config()
  3823  							rg := cc.createGroupForConsumer(&cfg, sa)
  3824  
  3825  							// Place our initial state here as well for assignment distribution.
  3826  							ca := &consumerAssignment{
  3827  								Group:   rg,
  3828  								Stream:  sa.Config.Name,
  3829  								Name:    name,
  3830  								Config:  &cfg,
  3831  								Client:  sa.Client,
  3832  								Created: o.createdTime(),
  3833  							}
  3834  
  3835  							addEntry := encodeAddConsumerAssignment(ca)
  3836  							cc.meta.ForwardProposal(addEntry)
  3837  
  3838  							// Check to make sure we see the assignment.
  3839  							go func() {
  3840  								ticker := time.NewTicker(time.Second)
  3841  								defer ticker.Stop()
  3842  								for range ticker.C {
  3843  									js.mu.RLock()
  3844  									ca, meta := js.consumerAssignment(ca.Client.serviceAccount(), sa.Config.Name, name), cc.meta
  3845  									js.mu.RUnlock()
  3846  									if ca == nil {
  3847  										s.Warnf("Consumer assignment has not been assigned, retrying")
  3848  										if meta != nil {
  3849  											meta.ForwardProposal(addEntry)
  3850  										} else {
  3851  											return
  3852  										}
  3853  									} else {
  3854  										return
  3855  									}
  3856  								}
  3857  							}()
  3858  						}
  3859  					}
  3860  				case <-s.quitCh:
  3861  					return
  3862  				}
  3863  			})
  3864  		} else {
  3865  			js.processStreamLeaderChange(mset, true)
  3866  		}
  3867  	}
  3868  }
  3869  
  3870  // processStreamRemoval is called when followers have replicated an assignment.
  3871  func (js *jetStream) processStreamRemoval(sa *streamAssignment) {
  3872  	js.mu.Lock()
  3873  	s, cc := js.srv, js.cluster
  3874  	if s == nil || cc == nil || cc.meta == nil {
  3875  		// TODO(dlc) - debug at least
  3876  		js.mu.Unlock()
  3877  		return
  3878  	}
  3879  	stream := sa.Config.Name
  3880  	isMember := sa.Group.isMember(cc.meta.ID())
  3881  	wasLeader := cc.isStreamLeader(sa.Client.serviceAccount(), stream)
  3882  
  3883  	// Check if we already have this assigned.
  3884  	accStreams := cc.streams[sa.Client.serviceAccount()]
  3885  	needDelete := accStreams != nil && accStreams[stream] != nil
  3886  	if needDelete {
  3887  		delete(accStreams, stream)
  3888  		if len(accStreams) == 0 {
  3889  			delete(cc.streams, sa.Client.serviceAccount())
  3890  		}
  3891  	}
  3892  	js.mu.Unlock()
  3893  
  3894  	if needDelete {
  3895  		js.processClusterDeleteStream(sa, isMember, wasLeader)
  3896  	}
  3897  }
  3898  
  3899  func (js *jetStream) processClusterDeleteStream(sa *streamAssignment, isMember, wasLeader bool) {
  3900  	if sa == nil {
  3901  		return
  3902  	}
  3903  	js.mu.RLock()
  3904  	s := js.srv
  3905  	node := sa.Group.node
  3906  	hadLeader := node == nil || node.GroupLeader() != noLeader
  3907  	offline := s.allPeersOffline(sa.Group)
  3908  	var isMetaLeader bool
  3909  	if cc := js.cluster; cc != nil {
  3910  		isMetaLeader = cc.isLeader()
  3911  	}
  3912  	recovering := sa.recovering
  3913  	js.mu.RUnlock()
  3914  
  3915  	stopped := false
  3916  	var resp = JSApiStreamDeleteResponse{ApiResponse: ApiResponse{Type: JSApiStreamDeleteResponseType}}
  3917  	var err error
  3918  	var acc *Account
  3919  
  3920  	// Go ahead and delete the stream if we have it and the account here.
  3921  	if acc, _ = s.LookupAccount(sa.Client.serviceAccount()); acc != nil {
  3922  		if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil {
  3923  			// shut down monitor by shutting down raft
  3924  			if n := mset.raftNode(); n != nil {
  3925  				n.Delete()
  3926  			}
  3927  			// wait for monitor to be shut down
  3928  			mset.monitorWg.Wait()
  3929  			err = mset.stop(true, wasLeader)
  3930  			stopped = true
  3931  		} else if isMember {
  3932  			s.Warnf("JetStream failed to lookup running stream while removing stream '%s > %s' from this server",
  3933  				sa.Client.serviceAccount(), sa.Config.Name)
  3934  		}
  3935  	} else if isMember {
  3936  		s.Warnf("JetStream failed to lookup account while removing stream '%s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name)
  3937  	}
  3938  
  3939  	// Always delete the node if present.
  3940  	if node != nil {
  3941  		node.Delete()
  3942  	}
  3943  
  3944  	// This is a stop gap cleanup in case
  3945  	// 1) the account does not exist (and mset couldn't be stopped) and/or
  3946  	// 2) node was nil (and couldn't be deleted)
  3947  	if !stopped || node == nil {
  3948  		if sacc := s.SystemAccount(); sacc != nil {
  3949  			saccName := sacc.GetName()
  3950  			os.RemoveAll(filepath.Join(js.config.StoreDir, saccName, defaultStoreDirName, sa.Group.Name))
  3951  			// cleanup dependent consumer groups
  3952  			if !stopped {
  3953  				for _, ca := range sa.consumers {
  3954  					// Make sure we cleanup any possible running nodes for the consumers.
  3955  					if isMember && ca.Group != nil && ca.Group.node != nil {
  3956  						ca.Group.node.Delete()
  3957  					}
  3958  					os.RemoveAll(filepath.Join(js.config.StoreDir, saccName, defaultStoreDirName, ca.Group.Name))
  3959  				}
  3960  			}
  3961  		}
  3962  	}
  3963  	accDir := filepath.Join(js.config.StoreDir, sa.Client.serviceAccount())
  3964  	streamDir := filepath.Join(accDir, streamsDir)
  3965  	os.RemoveAll(filepath.Join(streamDir, sa.Config.Name))
  3966  
  3967  	// no op if not empty
  3968  	os.Remove(streamDir)
  3969  	os.Remove(accDir)
  3970  
  3971  	// Normally we want only the leader to respond here, but if we had no leader then all members will respond to make
  3972  	// sure we get feedback to the user.
  3973  	if !isMember || (hadLeader && !wasLeader) {
  3974  		// If all the peers are offline and we are the meta leader we will also respond, so suppress returning here.
  3975  		if !(offline && isMetaLeader) {
  3976  			return
  3977  		}
  3978  	}
  3979  
  3980  	// Do not respond if the account does not exist any longer
  3981  	if acc == nil || recovering {
  3982  		return
  3983  	}
  3984  
  3985  	if err != nil {
  3986  		resp.Error = NewJSStreamGeneralError(err, Unless(err))
  3987  		s.sendAPIErrResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, s.jsonResponse(resp))
  3988  	} else {
  3989  		resp.Success = true
  3990  		s.sendAPIResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, s.jsonResponse(resp))
  3991  	}
  3992  }
  3993  
  3994  // processConsumerAssignment is called when followers have replicated an assignment for a consumer.
  3995  func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) {
  3996  	js.mu.RLock()
  3997  	s, cc := js.srv, js.cluster
  3998  	accName, stream, consumerName := ca.Client.serviceAccount(), ca.Stream, ca.Name
  3999  	noMeta := cc == nil || cc.meta == nil
  4000  	shuttingDown := js.shuttingDown
  4001  	var ourID string
  4002  	if !noMeta {
  4003  		ourID = cc.meta.ID()
  4004  	}
  4005  	var isMember bool
  4006  	if ca.Group != nil && ourID != _EMPTY_ {
  4007  		isMember = ca.Group.isMember(ourID)
  4008  	}
  4009  	js.mu.RUnlock()
  4010  
  4011  	if s == nil || noMeta || shuttingDown {
  4012  		return
  4013  	}
  4014  
  4015  	sa := js.streamAssignment(accName, stream)
  4016  	if sa == nil {
  4017  		s.Debugf("Consumer create failed, could not locate stream '%s > %s'", accName, stream)
  4018  		return
  4019  	}
  4020  
  4021  	// Might need this below.
  4022  	numReplicas := sa.Config.Replicas
  4023  
  4024  	// Track if this existed already.
  4025  	var wasExisting bool
  4026  
  4027  	// Check if we have an existing consumer assignment.
  4028  	js.mu.Lock()
  4029  	if sa.consumers == nil {
  4030  		sa.consumers = make(map[string]*consumerAssignment)
  4031  	} else if oca := sa.consumers[ca.Name]; oca != nil {
  4032  		wasExisting = true
  4033  		// Copy over private existing state from former SA.
  4034  		if ca.Group != nil {
  4035  			ca.Group.node = oca.Group.node
  4036  		}
  4037  		ca.responded = oca.responded
  4038  		ca.err = oca.err
  4039  	}
  4040  
  4041  	// Capture the optional state. We will pass it along if we are a member to apply.
  4042  	// This is only applicable when restoring a stream with consumers.
  4043  	state := ca.State
  4044  	ca.State = nil
  4045  
  4046  	// Place into our internal map under the stream assignment.
  4047  	// Ok to replace an existing one, we check on process call below.
  4048  	sa.consumers[ca.Name] = ca
  4049  	js.mu.Unlock()
  4050  
  4051  	acc, err := s.LookupAccount(accName)
  4052  	if err != nil {
  4053  		ll := fmt.Sprintf("Account [%s] lookup for consumer create failed: %v", accName, err)
  4054  		if isMember {
  4055  			if !js.isMetaRecovering() {
  4056  				// If we can not lookup the account and we are a member, send this result back to the metacontroller leader.
  4057  				result := &consumerAssignmentResult{
  4058  					Account:  accName,
  4059  					Stream:   stream,
  4060  					Consumer: consumerName,
  4061  					Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}},
  4062  				}
  4063  				result.Response.Error = NewJSNoAccountError()
  4064  				s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result)
  4065  			}
  4066  			s.Warnf(ll)
  4067  		} else {
  4068  			s.Debugf(ll)
  4069  		}
  4070  		return
  4071  	}
  4072  
  4073  	// Check if this is for us..
  4074  	if isMember {
  4075  		js.processClusterCreateConsumer(ca, state, wasExisting)
  4076  	} else {
  4077  		// We need to be removed here, we are no longer assigned.
  4078  		// Grab consumer if we have it.
  4079  		var o *consumer
  4080  		if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil {
  4081  			o = mset.lookupConsumer(ca.Name)
  4082  		}
  4083  
  4084  		// Check if we have a raft node running, meaning we are no longer part of the group but were.
  4085  		js.mu.Lock()
  4086  		if node := ca.Group.node; node != nil {
  4087  			// We have one here even though we are not a member. This can happen on re-assignment.
  4088  			s.Debugf("JetStream removing consumer '%s > %s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name, ca.Name)
  4089  			if node.Leader() {
  4090  				s.Debugf("JetStream consumer '%s > %s > %s' is being removed and was the leader, will perform stepdown",
  4091  					sa.Client.serviceAccount(), sa.Config.Name, ca.Name)
  4092  
  4093  				peers, cn := node.Peers(), s.cachedClusterName()
  4094  				migrating := numReplicas != len(peers)
  4095  
  4096  				// Select a new peer to transfer to. If we are a migrating make sure its from the new cluster.
  4097  				var npeer string
  4098  				for _, r := range peers {
  4099  					if !r.Current {
  4100  						continue
  4101  					}
  4102  					if !migrating {
  4103  						npeer = r.ID
  4104  						break
  4105  					} else if sir, ok := s.nodeToInfo.Load(r.ID); ok && sir != nil {
  4106  						si := sir.(nodeInfo)
  4107  						if si.cluster != cn {
  4108  							npeer = r.ID
  4109  							break
  4110  						}
  4111  					}
  4112  				}
  4113  				// Clear the raftnode from our consumer so that a subsequent o.delete will not also issue a stepdown.
  4114  				if o != nil {
  4115  					o.clearRaftNode()
  4116  				}
  4117  				// Manually handle the stepdown and deletion of the node.
  4118  				node.UpdateKnownPeers(ca.Group.Peers)
  4119  				node.StepDown(npeer)
  4120  				node.Delete()
  4121  			} else {
  4122  				node.UpdateKnownPeers(ca.Group.Peers)
  4123  			}
  4124  		}
  4125  		// Always clear the old node.
  4126  		ca.Group.node = nil
  4127  		ca.err = nil
  4128  		js.mu.Unlock()
  4129  
  4130  		if o != nil {
  4131  			o.deleteWithoutAdvisory()
  4132  		}
  4133  	}
  4134  }
  4135  
  4136  func (js *jetStream) processConsumerRemoval(ca *consumerAssignment) {
  4137  	js.mu.Lock()
  4138  	s, cc := js.srv, js.cluster
  4139  	if s == nil || cc == nil || cc.meta == nil {
  4140  		// TODO(dlc) - debug at least
  4141  		js.mu.Unlock()
  4142  		return
  4143  	}
  4144  	isMember := ca.Group.isMember(cc.meta.ID())
  4145  	wasLeader := cc.isConsumerLeader(ca.Client.serviceAccount(), ca.Stream, ca.Name)
  4146  
  4147  	// Delete from our state.
  4148  	var needDelete bool
  4149  	if accStreams := cc.streams[ca.Client.serviceAccount()]; accStreams != nil {
  4150  		if sa := accStreams[ca.Stream]; sa != nil && sa.consumers != nil && sa.consumers[ca.Name] != nil {
  4151  			oca := sa.consumers[ca.Name]
  4152  			// Make sure this removal is for what we have, otherwise ignore.
  4153  			if ca.Group != nil && oca.Group != nil && ca.Group.Name == oca.Group.Name {
  4154  				needDelete = true
  4155  				oca.deleted = true
  4156  				delete(sa.consumers, ca.Name)
  4157  			}
  4158  		}
  4159  	}
  4160  	js.mu.Unlock()
  4161  
  4162  	if needDelete {
  4163  		js.processClusterDeleteConsumer(ca, isMember, wasLeader)
  4164  	}
  4165  }
  4166  
  4167  type consumerAssignmentResult struct {
  4168  	Account  string                       `json:"account"`
  4169  	Stream   string                       `json:"stream"`
  4170  	Consumer string                       `json:"consumer"`
  4171  	Response *JSApiConsumerCreateResponse `json:"response,omitempty"`
  4172  }
  4173  
  4174  // processClusterCreateConsumer is when we are a member of the group and need to create the consumer.
  4175  func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state *ConsumerState, wasExisting bool) {
  4176  	if ca == nil {
  4177  		return
  4178  	}
  4179  	js.mu.RLock()
  4180  	s := js.srv
  4181  	rg := ca.Group
  4182  	alreadyRunning := rg != nil && rg.node != nil
  4183  	accName, stream, consumer := ca.Client.serviceAccount(), ca.Stream, ca.Name
  4184  	js.mu.RUnlock()
  4185  
  4186  	acc, err := s.LookupAccount(accName)
  4187  	if err != nil {
  4188  		s.Warnf("JetStream cluster failed to lookup axccount %q: %v", accName, err)
  4189  		return
  4190  	}
  4191  
  4192  	// Go ahead and create or update the consumer.
  4193  	mset, err := acc.lookupStream(stream)
  4194  	if err != nil {
  4195  		if !js.isMetaRecovering() {
  4196  			js.mu.Lock()
  4197  			s.Warnf("Consumer create failed, could not locate stream '%s > %s > %s'", ca.Client.serviceAccount(), ca.Stream, ca.Name)
  4198  			ca.err = NewJSStreamNotFoundError()
  4199  			result := &consumerAssignmentResult{
  4200  				Account:  ca.Client.serviceAccount(),
  4201  				Stream:   ca.Stream,
  4202  				Consumer: ca.Name,
  4203  				Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}},
  4204  			}
  4205  			result.Response.Error = NewJSStreamNotFoundError()
  4206  			s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result)
  4207  			js.mu.Unlock()
  4208  		}
  4209  		return
  4210  	}
  4211  
  4212  	// Check if we already have this consumer running.
  4213  	o := mset.lookupConsumer(consumer)
  4214  
  4215  	if !alreadyRunning {
  4216  		// Process the raft group and make sure its running if needed.
  4217  		storage := mset.config().Storage
  4218  		if ca.Config.MemoryStorage {
  4219  			storage = MemoryStorage
  4220  		}
  4221  		// No-op if R1.
  4222  		js.createRaftGroup(accName, rg, storage, pprofLabels{
  4223  			"type":     "consumer",
  4224  			"account":  mset.accName(),
  4225  			"stream":   ca.Stream,
  4226  			"consumer": ca.Name,
  4227  		})
  4228  	} else {
  4229  		// If we are clustered update the known peers.
  4230  		js.mu.RLock()
  4231  		if node := rg.node; node != nil {
  4232  			node.UpdateKnownPeers(ca.Group.Peers)
  4233  		}
  4234  		js.mu.RUnlock()
  4235  	}
  4236  
  4237  	// Check if we already have this consumer running.
  4238  	var didCreate, isConfigUpdate, needsLocalResponse bool
  4239  	if o == nil {
  4240  		// Add in the consumer if needed.
  4241  		if o, err = mset.addConsumerWithAssignment(ca.Config, ca.Name, ca, wasExisting, ActionCreateOrUpdate); err == nil {
  4242  			didCreate = true
  4243  		}
  4244  	} else {
  4245  		// This consumer exists.
  4246  		// Only update if config is really different.
  4247  		cfg := o.config()
  4248  		if isConfigUpdate = !reflect.DeepEqual(&cfg, ca.Config); isConfigUpdate {
  4249  			// Call into update, ignore consumer exists error here since this means an old deliver subject is bound
  4250  			// which can happen on restart etc.
  4251  			if err := o.updateConfig(ca.Config); err != nil && err != NewJSConsumerNameExistError() {
  4252  				// This is essentially an update that has failed. Respond back to metaleader if we are not recovering.
  4253  				js.mu.RLock()
  4254  				if !js.metaRecovering {
  4255  					result := &consumerAssignmentResult{
  4256  						Account:  accName,
  4257  						Stream:   stream,
  4258  						Consumer: consumer,
  4259  						Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}},
  4260  					}
  4261  					result.Response.Error = NewJSConsumerNameExistError()
  4262  					s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result)
  4263  				}
  4264  				s.Warnf("Consumer create failed during update for '%s > %s > %s': %v", ca.Client.serviceAccount(), ca.Stream, ca.Name, err)
  4265  				js.mu.RUnlock()
  4266  				return
  4267  			}
  4268  		}
  4269  
  4270  		var sendState bool
  4271  		js.mu.RLock()
  4272  		n := rg.node
  4273  		// Check if we already had a consumer assignment and its still pending.
  4274  		cca, oca := ca, o.consumerAssignment()
  4275  		if oca != nil {
  4276  			if !oca.responded {
  4277  				// We can't override info for replying here otherwise leader once elected can not respond.
  4278  				// So copy over original client and the reply from the old ca.
  4279  				cac := *ca
  4280  				cac.Client = oca.Client
  4281  				cac.Reply = oca.Reply
  4282  				cca = &cac
  4283  				needsLocalResponse = true
  4284  			}
  4285  			// If we look like we are scaling up, let's send our current state to the group.
  4286  			sendState = len(ca.Group.Peers) > len(oca.Group.Peers) && o.IsLeader() && n != nil
  4287  			// Signal that this is an update
  4288  			if ca.Reply != _EMPTY_ {
  4289  				isConfigUpdate = true
  4290  			}
  4291  		}
  4292  		js.mu.RUnlock()
  4293  
  4294  		if sendState {
  4295  			if snap, err := o.store.EncodedState(); err == nil {
  4296  				n.SendSnapshot(snap)
  4297  			}
  4298  		}
  4299  
  4300  		// Set CA for our consumer.
  4301  		o.setConsumerAssignment(cca)
  4302  		s.Debugf("JetStream cluster, consumer '%s > %s > %s' was already running", ca.Client.serviceAccount(), ca.Stream, ca.Name)
  4303  	}
  4304  
  4305  	// If we have an initial state set apply that now.
  4306  	if state != nil && o != nil {
  4307  		o.mu.Lock()
  4308  		err = o.setStoreState(state)
  4309  		o.mu.Unlock()
  4310  	}
  4311  
  4312  	if err != nil {
  4313  		if IsNatsErr(err, JSConsumerStoreFailedErrF) {
  4314  			s.Warnf("Consumer create failed for '%s > %s > %s': %v", ca.Client.serviceAccount(), ca.Stream, ca.Name, err)
  4315  			err = errConsumerStoreFailed
  4316  		}
  4317  
  4318  		js.mu.Lock()
  4319  
  4320  		ca.err = err
  4321  		hasResponded := ca.responded
  4322  
  4323  		// If out of space do nothing for now.
  4324  		if isOutOfSpaceErr(err) {
  4325  			hasResponded = true
  4326  		}
  4327  
  4328  		if rg.node != nil {
  4329  			rg.node.Delete()
  4330  			// Clear the node here.
  4331  			rg.node = nil
  4332  		}
  4333  
  4334  		// If we did seem to create a consumer make sure to stop it.
  4335  		if o != nil {
  4336  			o.stop()
  4337  		}
  4338  
  4339  		var result *consumerAssignmentResult
  4340  		if !hasResponded && !js.metaRecovering {
  4341  			result = &consumerAssignmentResult{
  4342  				Account:  ca.Client.serviceAccount(),
  4343  				Stream:   ca.Stream,
  4344  				Consumer: ca.Name,
  4345  				Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}},
  4346  			}
  4347  			result.Response.Error = NewJSConsumerCreateError(err, Unless(err))
  4348  		} else if err == errNoInterest {
  4349  			// This is a stranded ephemeral, let's clean this one up.
  4350  			subject := fmt.Sprintf(JSApiConsumerDeleteT, ca.Stream, ca.Name)
  4351  			mset.outq.send(newJSPubMsg(subject, _EMPTY_, _EMPTY_, nil, nil, nil, 0))
  4352  		}
  4353  		js.mu.Unlock()
  4354  
  4355  		if result != nil {
  4356  			// Send response to the metadata leader. They will forward to the user as needed.
  4357  			b, _ := json.Marshal(result) // Avoids auto-processing and doing fancy json with newlines.
  4358  			s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, b)
  4359  		}
  4360  	} else {
  4361  		if didCreate {
  4362  			o.setCreatedTime(ca.Created)
  4363  		} else {
  4364  			// Check for scale down to 1..
  4365  			if rg.node != nil && len(rg.Peers) == 1 {
  4366  				o.clearNode()
  4367  				o.setLeader(true)
  4368  				// Need to clear from rg too.
  4369  				js.mu.Lock()
  4370  				rg.node = nil
  4371  				client, subject, reply := ca.Client, ca.Subject, ca.Reply
  4372  				js.mu.Unlock()
  4373  				var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}
  4374  				resp.ConsumerInfo = o.info()
  4375  				s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  4376  				return
  4377  			}
  4378  		}
  4379  
  4380  		if rg.node == nil {
  4381  			// Single replica consumer, process manually here.
  4382  			js.mu.Lock()
  4383  			// Force response in case we think this is an update.
  4384  			if !js.metaRecovering && isConfigUpdate {
  4385  				ca.responded = false
  4386  			}
  4387  			js.mu.Unlock()
  4388  			js.processConsumerLeaderChange(o, true)
  4389  		} else {
  4390  			// Clustered consumer.
  4391  			// Start our monitoring routine if needed.
  4392  			if !alreadyRunning && o.shouldStartMonitor() {
  4393  				s.startGoRoutine(
  4394  					func() { js.monitorConsumer(o, ca) },
  4395  					pprofLabels{
  4396  						"type":     "consumer",
  4397  						"account":  mset.accName(),
  4398  						"stream":   mset.name(),
  4399  						"consumer": ca.Name,
  4400  					},
  4401  				)
  4402  			}
  4403  			// For existing consumer, only send response if not recovering.
  4404  			if wasExisting && !js.isMetaRecovering() {
  4405  				if o.IsLeader() || (!didCreate && needsLocalResponse) {
  4406  					// Process if existing as an update. Double check that this is not recovered.
  4407  					js.mu.RLock()
  4408  					client, subject, reply, recovering := ca.Client, ca.Subject, ca.Reply, ca.recovering
  4409  					js.mu.RUnlock()
  4410  					if !recovering {
  4411  						var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}
  4412  						resp.ConsumerInfo = o.info()
  4413  						s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  4414  					}
  4415  				}
  4416  			}
  4417  		}
  4418  	}
  4419  }
  4420  
  4421  func (js *jetStream) processClusterDeleteConsumer(ca *consumerAssignment, isMember, wasLeader bool) {
  4422  	if ca == nil {
  4423  		return
  4424  	}
  4425  	js.mu.RLock()
  4426  	s := js.srv
  4427  	node := ca.Group.node
  4428  	offline := s.allPeersOffline(ca.Group)
  4429  	var isMetaLeader bool
  4430  	if cc := js.cluster; cc != nil {
  4431  		isMetaLeader = cc.isLeader()
  4432  	}
  4433  	recovering := ca.recovering
  4434  	js.mu.RUnlock()
  4435  
  4436  	var resp = JSApiConsumerDeleteResponse{ApiResponse: ApiResponse{Type: JSApiConsumerDeleteResponseType}}
  4437  	var err error
  4438  	var acc *Account
  4439  
  4440  	// Go ahead and delete the consumer if we have it and the account.
  4441  	if acc, _ = s.LookupAccount(ca.Client.serviceAccount()); acc != nil {
  4442  		if mset, _ := acc.lookupStream(ca.Stream); mset != nil {
  4443  			if o := mset.lookupConsumer(ca.Name); o != nil {
  4444  				err = o.stopWithFlags(true, false, true, wasLeader)
  4445  			}
  4446  		}
  4447  	} else if ca.Group != nil {
  4448  		// We have a missing account, see if we can cleanup.
  4449  		if sacc := s.SystemAccount(); sacc != nil {
  4450  			os.RemoveAll(filepath.Join(js.config.StoreDir, sacc.GetName(), defaultStoreDirName, ca.Group.Name))
  4451  		}
  4452  	}
  4453  
  4454  	// Always delete the node if present.
  4455  	if node != nil {
  4456  		node.Delete()
  4457  	}
  4458  
  4459  	if !wasLeader || ca.Reply == _EMPTY_ {
  4460  		if !(offline && isMetaLeader) {
  4461  			return
  4462  		}
  4463  	}
  4464  
  4465  	// Do not respond if the account does not exist any longer or this is during recovery.
  4466  	if acc == nil || recovering {
  4467  		return
  4468  	}
  4469  
  4470  	if err != nil {
  4471  		resp.Error = NewJSStreamNotFoundError(Unless(err))
  4472  		s.sendAPIErrResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(resp))
  4473  	} else {
  4474  		resp.Success = true
  4475  		s.sendAPIResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(resp))
  4476  	}
  4477  }
  4478  
  4479  // Returns the consumer assignment, or nil if not present.
  4480  // Lock should be held.
  4481  func (js *jetStream) consumerAssignment(account, stream, consumer string) *consumerAssignment {
  4482  	if sa := js.streamAssignment(account, stream); sa != nil {
  4483  		return sa.consumers[consumer]
  4484  	}
  4485  	return nil
  4486  }
  4487  
  4488  // consumerAssigned informs us if this server has this consumer assigned.
  4489  func (jsa *jsAccount) consumerAssigned(stream, consumer string) bool {
  4490  	jsa.mu.RLock()
  4491  	js, acc := jsa.js, jsa.account
  4492  	jsa.mu.RUnlock()
  4493  
  4494  	if js == nil {
  4495  		return false
  4496  	}
  4497  	js.mu.RLock()
  4498  	defer js.mu.RUnlock()
  4499  	return js.cluster.isConsumerAssigned(acc, stream, consumer)
  4500  }
  4501  
  4502  // Read lock should be held.
  4503  func (cc *jetStreamCluster) isConsumerAssigned(a *Account, stream, consumer string) bool {
  4504  	// Non-clustered mode always return true.
  4505  	if cc == nil {
  4506  		return true
  4507  	}
  4508  	if cc.meta == nil {
  4509  		return false
  4510  	}
  4511  	var sa *streamAssignment
  4512  	accStreams := cc.streams[a.Name]
  4513  	if accStreams != nil {
  4514  		sa = accStreams[stream]
  4515  	}
  4516  	if sa == nil {
  4517  		// TODO(dlc) - This should not happen.
  4518  		return false
  4519  	}
  4520  	ca := sa.consumers[consumer]
  4521  	if ca == nil {
  4522  		return false
  4523  	}
  4524  	rg := ca.Group
  4525  	// Check if we are the leader of this raftGroup assigned to the stream.
  4526  	ourID := cc.meta.ID()
  4527  	for _, peer := range rg.Peers {
  4528  		if peer == ourID {
  4529  			return true
  4530  		}
  4531  	}
  4532  	return false
  4533  }
  4534  
  4535  // Returns our stream and underlying raft node.
  4536  func (o *consumer) streamAndNode() (*stream, RaftNode) {
  4537  	if o == nil {
  4538  		return nil, nil
  4539  	}
  4540  	o.mu.RLock()
  4541  	defer o.mu.RUnlock()
  4542  	return o.mset, o.node
  4543  }
  4544  
  4545  // Return the replica count for this consumer. If the consumer has been
  4546  // stopped, this will return an error.
  4547  func (o *consumer) replica() (int, error) {
  4548  	o.mu.RLock()
  4549  	oCfg := o.cfg
  4550  	mset := o.mset
  4551  	o.mu.RUnlock()
  4552  	if mset == nil {
  4553  		return 0, errBadConsumer
  4554  	}
  4555  	sCfg := mset.config()
  4556  	return oCfg.replicas(&sCfg), nil
  4557  }
  4558  
  4559  func (o *consumer) raftGroup() *raftGroup {
  4560  	if o == nil {
  4561  		return nil
  4562  	}
  4563  	o.mu.RLock()
  4564  	defer o.mu.RUnlock()
  4565  	if o.ca == nil {
  4566  		return nil
  4567  	}
  4568  	return o.ca.Group
  4569  }
  4570  
  4571  func (o *consumer) clearRaftNode() {
  4572  	if o == nil {
  4573  		return
  4574  	}
  4575  	o.mu.Lock()
  4576  	defer o.mu.Unlock()
  4577  	o.node = nil
  4578  }
  4579  
  4580  func (o *consumer) raftNode() RaftNode {
  4581  	if o == nil {
  4582  		return nil
  4583  	}
  4584  	o.mu.RLock()
  4585  	defer o.mu.RUnlock()
  4586  	return o.node
  4587  }
  4588  
  4589  func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) {
  4590  	s, n, cc := js.server(), o.raftNode(), js.cluster
  4591  	defer s.grWG.Done()
  4592  
  4593  	defer o.clearMonitorRunning()
  4594  
  4595  	if n == nil {
  4596  		s.Warnf("No RAFT group for '%s > %s > %s'", o.acc.Name, ca.Stream, ca.Name)
  4597  		return
  4598  	}
  4599  
  4600  	// Make sure to stop the raft group on exit to prevent accidental memory bloat.
  4601  	// This should be below the checkInMonitor call though to avoid stopping it out
  4602  	// from underneath the one that is running since it will be the same raft node.
  4603  	defer n.Stop()
  4604  
  4605  	qch, lch, aq, uch, ourPeerId := n.QuitC(), n.LeadChangeC(), n.ApplyQ(), o.updateC(), cc.meta.ID()
  4606  
  4607  	s.Debugf("Starting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group())
  4608  	defer s.Debugf("Exiting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group())
  4609  
  4610  	const (
  4611  		compactInterval = 2 * time.Minute
  4612  		compactSizeMin  = 64 * 1024 // What is stored here is always small for consumers.
  4613  		compactNumMin   = 1024
  4614  		minSnapDelta    = 10 * time.Second
  4615  	)
  4616  
  4617  	// Spread these out for large numbers on server restart.
  4618  	rci := time.Duration(rand.Int63n(int64(time.Minute)))
  4619  	t := time.NewTicker(compactInterval + rci)
  4620  	defer t.Stop()
  4621  
  4622  	// Highwayhash key for generating hashes.
  4623  	key := make([]byte, 32)
  4624  	crand.Read(key)
  4625  
  4626  	// Hash of the last snapshot (fixed size in memory).
  4627  	var lastSnap []byte
  4628  	var lastSnapTime time.Time
  4629  
  4630  	// Don't allow the upper layer to install snapshots until we have
  4631  	// fully recovered from disk.
  4632  	recovering := true
  4633  
  4634  	doSnapshot := func(force bool) {
  4635  		// Bail if trying too fast and not in a forced situation.
  4636  		if recovering || (!force && time.Since(lastSnapTime) < minSnapDelta) {
  4637  			return
  4638  		}
  4639  
  4640  		// Check several things to see if we need a snapshot.
  4641  		ne, nb := n.Size()
  4642  		if !n.NeedSnapshot() {
  4643  			// Check if we should compact etc. based on size of log.
  4644  			if !force && ne < compactNumMin && nb < compactSizeMin {
  4645  				return
  4646  			}
  4647  		}
  4648  
  4649  		if snap, err := o.store.EncodedState(); err == nil {
  4650  			hash := highwayhash.Sum(snap, key)
  4651  			// If the state hasn't changed but the log has gone way over
  4652  			// the compaction size then we will want to compact anyway.
  4653  			// This can happen for example when a pull consumer fetches a
  4654  			// lot on an idle stream, log entries get distributed but the
  4655  			// state never changes, therefore the log never gets compacted.
  4656  			if !bytes.Equal(hash[:], lastSnap) || ne >= compactNumMin || nb >= compactSizeMin {
  4657  				if err := n.InstallSnapshot(snap); err == nil {
  4658  					lastSnap, lastSnapTime = hash[:], time.Now()
  4659  				} else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning {
  4660  					s.RateLimitWarnf("Failed to install snapshot for '%s > %s > %s' [%s]: %v", o.acc.Name, ca.Stream, ca.Name, n.Group(), err)
  4661  				}
  4662  			}
  4663  		}
  4664  	}
  4665  
  4666  	// For migration tracking.
  4667  	var mmt *time.Ticker
  4668  	var mmtc <-chan time.Time
  4669  
  4670  	startMigrationMonitoring := func() {
  4671  		if mmt == nil {
  4672  			mmt = time.NewTicker(500 * time.Millisecond)
  4673  			mmtc = mmt.C
  4674  		}
  4675  	}
  4676  
  4677  	stopMigrationMonitoring := func() {
  4678  		if mmt != nil {
  4679  			mmt.Stop()
  4680  			mmt, mmtc = nil, nil
  4681  		}
  4682  	}
  4683  	defer stopMigrationMonitoring()
  4684  
  4685  	// Track if we are leader.
  4686  	var isLeader bool
  4687  
  4688  	for {
  4689  		select {
  4690  		case <-s.quitCh:
  4691  			return
  4692  		case <-qch:
  4693  			return
  4694  		case <-aq.ch:
  4695  			ces := aq.pop()
  4696  			for _, ce := range ces {
  4697  				// No special processing needed for when we are caught up on restart.
  4698  				if ce == nil {
  4699  					recovering = false
  4700  					if n.NeedSnapshot() {
  4701  						doSnapshot(true)
  4702  					}
  4703  					// Check our state if we are under an interest based stream.
  4704  					o.checkStateForInterestStream()
  4705  				} else if err := js.applyConsumerEntries(o, ce, isLeader); err == nil {
  4706  					ne, nb := n.Applied(ce.Index)
  4707  					ce.ReturnToPool()
  4708  					// If we have at least min entries to compact, go ahead and snapshot/compact.
  4709  					if nb > 0 && ne >= compactNumMin || nb > compactSizeMin {
  4710  						doSnapshot(false)
  4711  					}
  4712  				} else {
  4713  					s.Warnf("Error applying consumer entries to '%s > %s'", ca.Client.serviceAccount(), ca.Name)
  4714  				}
  4715  			}
  4716  			aq.recycle(&ces)
  4717  		case isLeader = <-lch:
  4718  			if recovering && !isLeader {
  4719  				js.setConsumerAssignmentRecovering(ca)
  4720  			}
  4721  
  4722  			// Process the change.
  4723  			if err := js.processConsumerLeaderChange(o, isLeader); err == nil && isLeader {
  4724  				doSnapshot(true)
  4725  			}
  4726  
  4727  			// We may receive a leader change after the consumer assignment which would cancel us
  4728  			// monitoring for this closely. So re-assess our state here as well.
  4729  			// Or the old leader is no longer part of the set and transferred leadership
  4730  			// for this leader to resume with removal
  4731  			rg := o.raftGroup()
  4732  
  4733  			// Check for migrations (peer count and replica count differ) here.
  4734  			// We set the state on the stream assignment update below.
  4735  			replicas, err := o.replica()
  4736  			if err != nil {
  4737  				continue
  4738  			}
  4739  			if isLeader && len(rg.Peers) != replicas {
  4740  				startMigrationMonitoring()
  4741  			} else {
  4742  				stopMigrationMonitoring()
  4743  			}
  4744  		case <-uch:
  4745  			// keep consumer assignment current
  4746  			ca = o.consumerAssignment()
  4747  			// We get this when we have a new consumer assignment caused by an update.
  4748  			// We want to know if we are migrating.
  4749  			rg := o.raftGroup()
  4750  			// keep peer list up to date with config
  4751  			js.checkPeers(rg)
  4752  			// If we are migrating, monitor for the new peers to be caught up.
  4753  			replicas, err := o.replica()
  4754  			if err != nil {
  4755  				continue
  4756  			}
  4757  			if isLeader && len(rg.Peers) != replicas {
  4758  				startMigrationMonitoring()
  4759  			} else {
  4760  				stopMigrationMonitoring()
  4761  			}
  4762  		case <-mmtc:
  4763  			if !isLeader {
  4764  				// We are no longer leader, so not our job.
  4765  				stopMigrationMonitoring()
  4766  				continue
  4767  			}
  4768  			rg := o.raftGroup()
  4769  			ci := js.clusterInfo(rg)
  4770  			replicas, err := o.replica()
  4771  			if err != nil {
  4772  				continue
  4773  			}
  4774  			if len(rg.Peers) <= replicas {
  4775  				// Migration no longer happening, so not our job anymore
  4776  				stopMigrationMonitoring()
  4777  				continue
  4778  			}
  4779  			newPeers, oldPeers, newPeerSet, _ := genPeerInfo(rg.Peers, len(rg.Peers)-replicas)
  4780  
  4781  			// If we are part of the new peerset and we have been passed the baton.
  4782  			// We will handle scale down.
  4783  			if newPeerSet[ourPeerId] {
  4784  				for _, p := range oldPeers {
  4785  					n.ProposeRemovePeer(p)
  4786  				}
  4787  				cca := ca.copyGroup()
  4788  				cca.Group.Peers = newPeers
  4789  				cca.Group.Cluster = s.cachedClusterName()
  4790  				cc.meta.ForwardProposal(encodeAddConsumerAssignment(cca))
  4791  				s.Noticef("Scaling down '%s > %s > %s' to %+v", ca.Client.serviceAccount(), ca.Stream, ca.Name, s.peerSetToNames(newPeers))
  4792  
  4793  			} else {
  4794  				var newLeaderPeer, newLeader, newCluster string
  4795  				neededCurrent, current := replicas/2+1, 0
  4796  				for _, r := range ci.Replicas {
  4797  					if r.Current && newPeerSet[r.Peer] {
  4798  						current++
  4799  						if newCluster == _EMPTY_ {
  4800  							newLeaderPeer, newLeader, newCluster = r.Peer, r.Name, r.cluster
  4801  						}
  4802  					}
  4803  				}
  4804  
  4805  				// Check if we have a quorom
  4806  				if current >= neededCurrent {
  4807  					s.Noticef("Transfer of consumer leader for '%s > %s > %s' to '%s'", ca.Client.serviceAccount(), ca.Stream, ca.Name, newLeader)
  4808  					n.StepDown(newLeaderPeer)
  4809  				}
  4810  			}
  4811  
  4812  		case <-t.C:
  4813  			doSnapshot(false)
  4814  		}
  4815  	}
  4816  }
  4817  
  4818  func (js *jetStream) applyConsumerEntries(o *consumer, ce *CommittedEntry, isLeader bool) error {
  4819  	for _, e := range ce.Entries {
  4820  		if e.Type == EntrySnapshot {
  4821  			if !isLeader {
  4822  				// No-op needed?
  4823  				state, err := decodeConsumerState(e.Data)
  4824  				if err != nil {
  4825  					if mset, node := o.streamAndNode(); mset != nil && node != nil {
  4826  						s := js.srv
  4827  						s.Errorf("JetStream cluster could not decode consumer snapshot for '%s > %s > %s' [%s]",
  4828  							mset.account(), mset.name(), o, node.Group())
  4829  					}
  4830  					panic(err.Error())
  4831  				}
  4832  				if err = o.store.Update(state); err != nil {
  4833  					o.mu.RLock()
  4834  					s, acc, mset, name := o.srv, o.acc, o.mset, o.name
  4835  					o.mu.RUnlock()
  4836  					if s != nil && mset != nil {
  4837  						s.Warnf("Consumer '%s > %s > %s' error on store update from snapshot entry: %v", acc, mset.name(), name, err)
  4838  					}
  4839  				} else {
  4840  					o.checkStateForInterestStream()
  4841  				}
  4842  			}
  4843  
  4844  		} else if e.Type == EntryRemovePeer {
  4845  			js.mu.RLock()
  4846  			var ourID string
  4847  			if js.cluster != nil && js.cluster.meta != nil {
  4848  				ourID = js.cluster.meta.ID()
  4849  			}
  4850  			js.mu.RUnlock()
  4851  			if peer := string(e.Data); peer == ourID {
  4852  				shouldRemove := true
  4853  				if mset := o.getStream(); mset != nil {
  4854  					if sa := mset.streamAssignment(); sa != nil && sa.Group != nil {
  4855  						js.mu.RLock()
  4856  						shouldRemove = !sa.Group.isMember(ourID)
  4857  						js.mu.RUnlock()
  4858  					}
  4859  				}
  4860  				if shouldRemove {
  4861  					o.stopWithFlags(true, false, false, false)
  4862  				}
  4863  			}
  4864  			return nil
  4865  		} else if e.Type == EntryAddPeer {
  4866  			// Ignore for now.
  4867  		} else {
  4868  			buf := e.Data
  4869  			switch entryOp(buf[0]) {
  4870  			case updateDeliveredOp:
  4871  				// These are handled in place in leaders.
  4872  				if !isLeader {
  4873  					dseq, sseq, dc, ts, err := decodeDeliveredUpdate(buf[1:])
  4874  					if err != nil {
  4875  						if mset, node := o.streamAndNode(); mset != nil && node != nil {
  4876  							s := js.srv
  4877  							s.Errorf("JetStream cluster could not decode consumer delivered update for '%s > %s > %s' [%s]",
  4878  								mset.account(), mset.name(), o, node.Group())
  4879  						}
  4880  						panic(err.Error())
  4881  					}
  4882  					// Make sure to update delivered under the lock.
  4883  					o.mu.Lock()
  4884  					err = o.store.UpdateDelivered(dseq, sseq, dc, ts)
  4885  					o.ldt = time.Now()
  4886  					o.mu.Unlock()
  4887  					if err != nil {
  4888  						panic(err.Error())
  4889  					}
  4890  				}
  4891  			case updateAcksOp:
  4892  				dseq, sseq, err := decodeAckUpdate(buf[1:])
  4893  				if err != nil {
  4894  					if mset, node := o.streamAndNode(); mset != nil && node != nil {
  4895  						s := js.srv
  4896  						s.Errorf("JetStream cluster could not decode consumer ack update for '%s > %s > %s' [%s]",
  4897  							mset.account(), mset.name(), o, node.Group())
  4898  					}
  4899  					panic(err.Error())
  4900  				}
  4901  				o.processReplicatedAck(dseq, sseq)
  4902  			case updateSkipOp:
  4903  				o.mu.Lock()
  4904  				if !o.isLeader() {
  4905  					var le = binary.LittleEndian
  4906  					if sseq := le.Uint64(buf[1:]); sseq > o.sseq {
  4907  						o.sseq = sseq
  4908  					}
  4909  				}
  4910  				o.mu.Unlock()
  4911  			case addPendingRequest:
  4912  				o.mu.Lock()
  4913  				if !o.isLeader() {
  4914  					if o.prm == nil {
  4915  						o.prm = make(map[string]struct{})
  4916  					}
  4917  					o.prm[string(buf[1:])] = struct{}{}
  4918  				}
  4919  				o.mu.Unlock()
  4920  			case removePendingRequest:
  4921  				o.mu.Lock()
  4922  				if !o.isLeader() {
  4923  					if o.prm != nil {
  4924  						delete(o.prm, string(buf[1:]))
  4925  					}
  4926  				}
  4927  				o.mu.Unlock()
  4928  			default:
  4929  				panic(fmt.Sprintf("JetStream Cluster Unknown group entry op type: %v", entryOp(buf[0])))
  4930  			}
  4931  		}
  4932  	}
  4933  	return nil
  4934  }
  4935  
  4936  func (o *consumer) processReplicatedAck(dseq, sseq uint64) {
  4937  	o.mu.Lock()
  4938  
  4939  	mset := o.mset
  4940  	if o.closed || mset == nil {
  4941  		o.mu.Unlock()
  4942  		return
  4943  	}
  4944  
  4945  	// Update activity.
  4946  	o.lat = time.Now()
  4947  
  4948  	// Do actual ack update to store.
  4949  	o.store.UpdateAcks(dseq, sseq)
  4950  
  4951  	if o.retention == LimitsPolicy {
  4952  		o.mu.Unlock()
  4953  		return
  4954  	}
  4955  
  4956  	var sagap uint64
  4957  	if o.cfg.AckPolicy == AckAll {
  4958  		if o.isLeader() {
  4959  			sagap = sseq - o.asflr
  4960  		} else {
  4961  			// We are a follower so only have the store state, so read that in.
  4962  			state, err := o.store.State()
  4963  			if err != nil {
  4964  				o.mu.Unlock()
  4965  				return
  4966  			}
  4967  			sagap = sseq - state.AckFloor.Stream
  4968  		}
  4969  	}
  4970  	o.mu.Unlock()
  4971  
  4972  	if sagap > 1 {
  4973  		// FIXME(dlc) - This is very inefficient, will need to fix.
  4974  		for seq := sseq; seq > sseq-sagap; seq-- {
  4975  			mset.ackMsg(o, seq)
  4976  		}
  4977  	} else {
  4978  		mset.ackMsg(o, sseq)
  4979  	}
  4980  }
  4981  
  4982  var errBadAckUpdate = errors.New("jetstream cluster bad replicated ack update")
  4983  var errBadDeliveredUpdate = errors.New("jetstream cluster bad replicated delivered update")
  4984  
  4985  func decodeAckUpdate(buf []byte) (dseq, sseq uint64, err error) {
  4986  	var bi, n int
  4987  	if dseq, n = binary.Uvarint(buf); n < 0 {
  4988  		return 0, 0, errBadAckUpdate
  4989  	}
  4990  	bi += n
  4991  	if sseq, n = binary.Uvarint(buf[bi:]); n < 0 {
  4992  		return 0, 0, errBadAckUpdate
  4993  	}
  4994  	return dseq, sseq, nil
  4995  }
  4996  
  4997  func decodeDeliveredUpdate(buf []byte) (dseq, sseq, dc uint64, ts int64, err error) {
  4998  	var bi, n int
  4999  	if dseq, n = binary.Uvarint(buf); n < 0 {
  5000  		return 0, 0, 0, 0, errBadDeliveredUpdate
  5001  	}
  5002  	bi += n
  5003  	if sseq, n = binary.Uvarint(buf[bi:]); n < 0 {
  5004  		return 0, 0, 0, 0, errBadDeliveredUpdate
  5005  	}
  5006  	bi += n
  5007  	if dc, n = binary.Uvarint(buf[bi:]); n < 0 {
  5008  		return 0, 0, 0, 0, errBadDeliveredUpdate
  5009  	}
  5010  	bi += n
  5011  	if ts, n = binary.Varint(buf[bi:]); n < 0 {
  5012  		return 0, 0, 0, 0, errBadDeliveredUpdate
  5013  	}
  5014  	return dseq, sseq, dc, ts, nil
  5015  }
  5016  
  5017  func (js *jetStream) processConsumerLeaderChange(o *consumer, isLeader bool) error {
  5018  	stepDownIfLeader := func() error {
  5019  		if node := o.raftNode(); node != nil && isLeader {
  5020  			node.StepDown()
  5021  		}
  5022  		return errors.New("failed to update consumer leader status")
  5023  	}
  5024  
  5025  	if o == nil || o.isClosed() {
  5026  		return stepDownIfLeader()
  5027  	}
  5028  
  5029  	ca := o.consumerAssignment()
  5030  	if ca == nil {
  5031  		return stepDownIfLeader()
  5032  	}
  5033  	js.mu.Lock()
  5034  	s, account, err := js.srv, ca.Client.serviceAccount(), ca.err
  5035  	client, subject, reply, streamName, consumerName := ca.Client, ca.Subject, ca.Reply, ca.Stream, ca.Name
  5036  	hasResponded := ca.responded
  5037  	ca.responded = true
  5038  	js.mu.Unlock()
  5039  
  5040  	acc, _ := s.LookupAccount(account)
  5041  	if acc == nil {
  5042  		return stepDownIfLeader()
  5043  	}
  5044  
  5045  	if isLeader {
  5046  		s.Noticef("JetStream cluster new consumer leader for '%s > %s > %s'", ca.Client.serviceAccount(), streamName, consumerName)
  5047  		s.sendConsumerLeaderElectAdvisory(o)
  5048  		// Check for peer removal and process here if needed.
  5049  		js.checkPeers(ca.Group)
  5050  	} else {
  5051  		// We are stepping down.
  5052  		// Make sure if we are doing so because we have lost quorum that we send the appropriate advisories.
  5053  		if node := o.raftNode(); node != nil && !node.Quorum() && time.Since(node.Created()) > 5*time.Second {
  5054  			s.sendConsumerLostQuorumAdvisory(o)
  5055  		}
  5056  	}
  5057  
  5058  	// Tell consumer to switch leader status.
  5059  	o.setLeader(isLeader)
  5060  
  5061  	if !isLeader || hasResponded {
  5062  		if isLeader {
  5063  			o.clearInitialInfo()
  5064  		}
  5065  		return nil
  5066  	}
  5067  
  5068  	var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}
  5069  	if err != nil {
  5070  		resp.Error = NewJSConsumerCreateError(err, Unless(err))
  5071  		s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  5072  	} else {
  5073  		resp.ConsumerInfo = o.initialInfo()
  5074  		s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  5075  		if node := o.raftNode(); node != nil {
  5076  			o.sendCreateAdvisory()
  5077  		}
  5078  	}
  5079  
  5080  	// Only send a pause advisory on consumer create if we're
  5081  	// actually paused. The timer would have been kicked by now
  5082  	// by the call to o.setLeader() above.
  5083  	if isLeader && o.cfg.PauseUntil != nil && !o.cfg.PauseUntil.IsZero() && time.Now().Before(*o.cfg.PauseUntil) {
  5084  		o.sendPauseAdvisoryLocked(&o.cfg)
  5085  	}
  5086  
  5087  	return nil
  5088  }
  5089  
  5090  // Determines if we should send lost quorum advisory. We throttle these after first one.
  5091  func (o *consumer) shouldSendLostQuorum() bool {
  5092  	o.mu.Lock()
  5093  	defer o.mu.Unlock()
  5094  	if time.Since(o.lqsent) >= lostQuorumAdvInterval {
  5095  		o.lqsent = time.Now()
  5096  		return true
  5097  	}
  5098  	return false
  5099  }
  5100  
  5101  func (s *Server) sendConsumerLostQuorumAdvisory(o *consumer) {
  5102  	if o == nil {
  5103  		return
  5104  	}
  5105  	node, stream, consumer, acc := o.raftNode(), o.streamName(), o.String(), o.account()
  5106  	if node == nil {
  5107  		return
  5108  	}
  5109  	if !o.shouldSendLostQuorum() {
  5110  		return
  5111  	}
  5112  
  5113  	s.Warnf("JetStream cluster consumer '%s > %s > %s' has NO quorum, stalled.", acc.GetName(), stream, consumer)
  5114  
  5115  	subj := JSAdvisoryConsumerQuorumLostPre + "." + stream + "." + consumer
  5116  	adv := &JSConsumerQuorumLostAdvisory{
  5117  		TypedEvent: TypedEvent{
  5118  			Type: JSConsumerQuorumLostAdvisoryType,
  5119  			ID:   nuid.Next(),
  5120  			Time: time.Now().UTC(),
  5121  		},
  5122  		Stream:   stream,
  5123  		Consumer: consumer,
  5124  		Replicas: s.replicas(node),
  5125  		Domain:   s.getOpts().JetStreamDomain,
  5126  	}
  5127  
  5128  	// Send to the user's account if not the system account.
  5129  	if acc != s.SystemAccount() {
  5130  		s.publishAdvisory(acc, subj, adv)
  5131  	}
  5132  	// Now do system level one. Place account info in adv, and nil account means system.
  5133  	adv.Account = acc.GetName()
  5134  	s.publishAdvisory(nil, subj, adv)
  5135  }
  5136  
  5137  func (s *Server) sendConsumerLeaderElectAdvisory(o *consumer) {
  5138  	if o == nil {
  5139  		return
  5140  	}
  5141  	node, stream, consumer, acc := o.raftNode(), o.streamName(), o.String(), o.account()
  5142  	if node == nil {
  5143  		return
  5144  	}
  5145  
  5146  	subj := JSAdvisoryConsumerLeaderElectedPre + "." + stream + "." + consumer
  5147  	adv := &JSConsumerLeaderElectedAdvisory{
  5148  		TypedEvent: TypedEvent{
  5149  			Type: JSConsumerLeaderElectedAdvisoryType,
  5150  			ID:   nuid.Next(),
  5151  			Time: time.Now().UTC(),
  5152  		},
  5153  		Stream:   stream,
  5154  		Consumer: consumer,
  5155  		Leader:   s.serverNameForNode(node.GroupLeader()),
  5156  		Replicas: s.replicas(node),
  5157  		Domain:   s.getOpts().JetStreamDomain,
  5158  	}
  5159  
  5160  	// Send to the user's account if not the system account.
  5161  	if acc != s.SystemAccount() {
  5162  		s.publishAdvisory(acc, subj, adv)
  5163  	}
  5164  	// Now do system level one. Place account info in adv, and nil account means system.
  5165  	adv.Account = acc.GetName()
  5166  	s.publishAdvisory(nil, subj, adv)
  5167  }
  5168  
  5169  type streamAssignmentResult struct {
  5170  	Account  string                      `json:"account"`
  5171  	Stream   string                      `json:"stream"`
  5172  	Response *JSApiStreamCreateResponse  `json:"create_response,omitempty"`
  5173  	Restore  *JSApiStreamRestoreResponse `json:"restore_response,omitempty"`
  5174  	Update   bool                        `json:"is_update,omitempty"`
  5175  }
  5176  
  5177  // Determine if this is an insufficient resources' error type.
  5178  func isInsufficientResourcesErr(resp *JSApiStreamCreateResponse) bool {
  5179  	return resp != nil && resp.Error != nil && IsNatsErr(resp.Error, JSInsufficientResourcesErr, JSMemoryResourcesExceededErr, JSStorageResourcesExceededErr)
  5180  }
  5181  
  5182  // Process error results of stream and consumer assignments.
  5183  // Success will be handled by stream leader.
  5184  func (js *jetStream) processStreamAssignmentResults(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
  5185  	var result streamAssignmentResult
  5186  	if err := json.Unmarshal(msg, &result); err != nil {
  5187  		// TODO(dlc) - log
  5188  		return
  5189  	}
  5190  	acc, _ := js.srv.LookupAccount(result.Account)
  5191  	if acc == nil {
  5192  		// TODO(dlc) - log
  5193  		return
  5194  	}
  5195  
  5196  	js.mu.Lock()
  5197  	defer js.mu.Unlock()
  5198  
  5199  	s, cc := js.srv, js.cluster
  5200  	if cc == nil || cc.meta == nil {
  5201  		return
  5202  	}
  5203  
  5204  	// This should have been done already in processStreamAssignment, but in
  5205  	// case we have a code path that gets here with no processStreamAssignment,
  5206  	// then we will do the proper thing. Otherwise will be a no-op.
  5207  	cc.removeInflightProposal(result.Account, result.Stream)
  5208  
  5209  	// FIXME(dlc) - suppress duplicates?
  5210  	if sa := js.streamAssignment(result.Account, result.Stream); sa != nil {
  5211  		canDelete := !result.Update && time.Since(sa.Created) < 5*time.Second
  5212  
  5213  		// See if we should retry in case this cluster is full but there are others.
  5214  		if cfg, ci := sa.Config, sa.Client; cfg != nil && ci != nil && isInsufficientResourcesErr(result.Response) && canDelete {
  5215  			// If cluster is defined we can not retry.
  5216  			if cfg.Placement == nil || cfg.Placement.Cluster == _EMPTY_ {
  5217  				// If we have additional clusters to try we can retry.
  5218  				if ci != nil && len(ci.Alternates) > 0 {
  5219  					if rg, err := js.createGroupForStream(ci, cfg); err != nil {
  5220  						s.Warnf("Retrying cluster placement for stream '%s > %s' failed due to placement error: %+v", result.Account, result.Stream, err)
  5221  					} else {
  5222  						if org := sa.Group; org != nil && len(org.Peers) > 0 {
  5223  							s.Warnf("Retrying cluster placement for stream '%s > %s' due to insufficient resources in cluster %q",
  5224  								result.Account, result.Stream, s.clusterNameForNode(org.Peers[0]))
  5225  						} else {
  5226  							s.Warnf("Retrying cluster placement for stream '%s > %s' due to insufficient resources", result.Account, result.Stream)
  5227  						}
  5228  						// Pick a new preferred leader.
  5229  						rg.setPreferred()
  5230  						// Get rid of previous attempt.
  5231  						cc.meta.Propose(encodeDeleteStreamAssignment(sa))
  5232  						// Propose new.
  5233  						sa.Group, sa.err = rg, nil
  5234  						cc.meta.Propose(encodeAddStreamAssignment(sa))
  5235  						return
  5236  					}
  5237  				}
  5238  			}
  5239  		}
  5240  
  5241  		// Respond to the user here.
  5242  		var resp string
  5243  		if result.Response != nil {
  5244  			resp = s.jsonResponse(result.Response)
  5245  		} else if result.Restore != nil {
  5246  			resp = s.jsonResponse(result.Restore)
  5247  		}
  5248  		if !sa.responded || result.Update {
  5249  			sa.responded = true
  5250  			js.srv.sendAPIErrResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, resp)
  5251  		}
  5252  		// Remove this assignment if possible.
  5253  		if canDelete {
  5254  			sa.err = NewJSClusterNotAssignedError()
  5255  			cc.meta.Propose(encodeDeleteStreamAssignment(sa))
  5256  		}
  5257  	}
  5258  }
  5259  
  5260  func (js *jetStream) processConsumerAssignmentResults(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
  5261  	var result consumerAssignmentResult
  5262  	if err := json.Unmarshal(msg, &result); err != nil {
  5263  		// TODO(dlc) - log
  5264  		return
  5265  	}
  5266  	acc, _ := js.srv.LookupAccount(result.Account)
  5267  	if acc == nil {
  5268  		// TODO(dlc) - log
  5269  		return
  5270  	}
  5271  
  5272  	js.mu.Lock()
  5273  	defer js.mu.Unlock()
  5274  
  5275  	s, cc := js.srv, js.cluster
  5276  	if cc == nil || cc.meta == nil {
  5277  		return
  5278  	}
  5279  
  5280  	if sa := js.streamAssignment(result.Account, result.Stream); sa != nil && sa.consumers != nil {
  5281  		if ca := sa.consumers[result.Consumer]; ca != nil && !ca.responded {
  5282  			js.srv.sendAPIErrResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(result.Response))
  5283  			ca.responded = true
  5284  
  5285  			// Check if this failed.
  5286  			// TODO(dlc) - Could have mixed results, should track per peer.
  5287  			// Make sure this is recent response, do not delete existing consumers.
  5288  			if result.Response.Error != nil && result.Response.Error != NewJSConsumerNameExistError() && time.Since(ca.Created) < 2*time.Second {
  5289  				// So while we are deleting we will not respond to list/names requests.
  5290  				ca.err = NewJSClusterNotAssignedError()
  5291  				cc.meta.Propose(encodeDeleteConsumerAssignment(ca))
  5292  				s.Warnf("Proposing to delete consumer `%s > %s > %s' due to assignment response error: %v",
  5293  					result.Account, result.Stream, result.Consumer, result.Response.Error)
  5294  			}
  5295  		}
  5296  	}
  5297  }
  5298  
  5299  const (
  5300  	streamAssignmentSubj   = "$SYS.JSC.STREAM.ASSIGNMENT.RESULT"
  5301  	consumerAssignmentSubj = "$SYS.JSC.CONSUMER.ASSIGNMENT.RESULT"
  5302  )
  5303  
  5304  // Lock should be held.
  5305  func (js *jetStream) startUpdatesSub() {
  5306  	cc, s, c := js.cluster, js.srv, js.cluster.c
  5307  	if cc.streamResults == nil {
  5308  		cc.streamResults, _ = s.systemSubscribe(streamAssignmentSubj, _EMPTY_, false, c, js.processStreamAssignmentResults)
  5309  	}
  5310  	if cc.consumerResults == nil {
  5311  		cc.consumerResults, _ = s.systemSubscribe(consumerAssignmentSubj, _EMPTY_, false, c, js.processConsumerAssignmentResults)
  5312  	}
  5313  	if cc.stepdown == nil {
  5314  		cc.stepdown, _ = s.systemSubscribe(JSApiLeaderStepDown, _EMPTY_, false, c, s.jsLeaderStepDownRequest)
  5315  	}
  5316  	if cc.peerRemove == nil {
  5317  		cc.peerRemove, _ = s.systemSubscribe(JSApiRemoveServer, _EMPTY_, false, c, s.jsLeaderServerRemoveRequest)
  5318  	}
  5319  	if cc.peerStreamMove == nil {
  5320  		cc.peerStreamMove, _ = s.systemSubscribe(JSApiServerStreamMove, _EMPTY_, false, c, s.jsLeaderServerStreamMoveRequest)
  5321  	}
  5322  	if cc.peerStreamCancelMove == nil {
  5323  		cc.peerStreamCancelMove, _ = s.systemSubscribe(JSApiServerStreamCancelMove, _EMPTY_, false, c, s.jsLeaderServerStreamCancelMoveRequest)
  5324  	}
  5325  	if js.accountPurge == nil {
  5326  		js.accountPurge, _ = s.systemSubscribe(JSApiAccountPurge, _EMPTY_, false, c, s.jsLeaderAccountPurgeRequest)
  5327  	}
  5328  }
  5329  
  5330  // Lock should be held.
  5331  func (js *jetStream) stopUpdatesSub() {
  5332  	cc := js.cluster
  5333  	if cc.streamResults != nil {
  5334  		cc.s.sysUnsubscribe(cc.streamResults)
  5335  		cc.streamResults = nil
  5336  	}
  5337  	if cc.consumerResults != nil {
  5338  		cc.s.sysUnsubscribe(cc.consumerResults)
  5339  		cc.consumerResults = nil
  5340  	}
  5341  	if cc.stepdown != nil {
  5342  		cc.s.sysUnsubscribe(cc.stepdown)
  5343  		cc.stepdown = nil
  5344  	}
  5345  	if cc.peerRemove != nil {
  5346  		cc.s.sysUnsubscribe(cc.peerRemove)
  5347  		cc.peerRemove = nil
  5348  	}
  5349  	if cc.peerStreamMove != nil {
  5350  		cc.s.sysUnsubscribe(cc.peerStreamMove)
  5351  		cc.peerStreamMove = nil
  5352  	}
  5353  	if cc.peerStreamCancelMove != nil {
  5354  		cc.s.sysUnsubscribe(cc.peerStreamCancelMove)
  5355  		cc.peerStreamCancelMove = nil
  5356  	}
  5357  	if js.accountPurge != nil {
  5358  		cc.s.sysUnsubscribe(js.accountPurge)
  5359  		js.accountPurge = nil
  5360  	}
  5361  }
  5362  
  5363  func (s *Server) sendDomainLeaderElectAdvisory() {
  5364  	js, cc := s.getJetStreamCluster()
  5365  	if js == nil || cc == nil {
  5366  		return
  5367  	}
  5368  
  5369  	js.mu.RLock()
  5370  	node := cc.meta
  5371  	js.mu.RUnlock()
  5372  
  5373  	adv := &JSDomainLeaderElectedAdvisory{
  5374  		TypedEvent: TypedEvent{
  5375  			Type: JSDomainLeaderElectedAdvisoryType,
  5376  			ID:   nuid.Next(),
  5377  			Time: time.Now().UTC(),
  5378  		},
  5379  		Leader:   node.GroupLeader(),
  5380  		Replicas: s.replicas(node),
  5381  		Cluster:  s.cachedClusterName(),
  5382  		Domain:   s.getOpts().JetStreamDomain,
  5383  	}
  5384  
  5385  	s.publishAdvisory(nil, JSAdvisoryDomainLeaderElected, adv)
  5386  }
  5387  
  5388  func (js *jetStream) processLeaderChange(isLeader bool) {
  5389  	if js == nil {
  5390  		return
  5391  	}
  5392  	s := js.srv
  5393  	if s == nil {
  5394  		return
  5395  	}
  5396  	// Update our server atomic.
  5397  	s.isMetaLeader.Store(isLeader)
  5398  
  5399  	if isLeader {
  5400  		s.Noticef("Self is new JetStream cluster metadata leader")
  5401  		s.sendDomainLeaderElectAdvisory()
  5402  	} else {
  5403  		var node string
  5404  		if meta := js.getMetaGroup(); meta != nil {
  5405  			node = meta.GroupLeader()
  5406  		}
  5407  		if node == _EMPTY_ {
  5408  			s.Noticef("JetStream cluster no metadata leader")
  5409  		} else if srv := js.srv.serverNameForNode(node); srv == _EMPTY_ {
  5410  			s.Noticef("JetStream cluster new remote metadata leader")
  5411  		} else if clst := js.srv.clusterNameForNode(node); clst == _EMPTY_ {
  5412  			s.Noticef("JetStream cluster new metadata leader: %s", srv)
  5413  		} else {
  5414  			s.Noticef("JetStream cluster new metadata leader: %s/%s", srv, clst)
  5415  		}
  5416  	}
  5417  
  5418  	js.mu.Lock()
  5419  	defer js.mu.Unlock()
  5420  
  5421  	if isLeader {
  5422  		js.startUpdatesSub()
  5423  	} else {
  5424  		js.stopUpdatesSub()
  5425  		// TODO(dlc) - stepdown.
  5426  	}
  5427  
  5428  	// If we have been signaled to check the streams, this is for a bug that left stream
  5429  	// assignments with no sync subject after an update and no way to sync/catchup outside of the RAFT layer.
  5430  	if isLeader && js.cluster.streamsCheck {
  5431  		cc := js.cluster
  5432  		for acc, asa := range cc.streams {
  5433  			for _, sa := range asa {
  5434  				if sa.Sync == _EMPTY_ {
  5435  					s.Warnf("Stream assigment corrupt for stream '%s > %s'", acc, sa.Config.Name)
  5436  					nsa := &streamAssignment{Group: sa.Group, Config: sa.Config, Subject: sa.Subject, Reply: sa.Reply, Client: sa.Client}
  5437  					nsa.Sync = syncSubjForStream()
  5438  					cc.meta.Propose(encodeUpdateStreamAssignment(nsa))
  5439  				}
  5440  			}
  5441  		}
  5442  		// Clear check.
  5443  		cc.streamsCheck = false
  5444  	}
  5445  }
  5446  
  5447  // Lock should be held.
  5448  func (cc *jetStreamCluster) remapStreamAssignment(sa *streamAssignment, removePeer string) bool {
  5449  	// Invoke placement algo passing RG peers that stay (existing) and the peer that is being removed (ignore)
  5450  	var retain, ignore []string
  5451  	for _, v := range sa.Group.Peers {
  5452  		if v == removePeer {
  5453  			ignore = append(ignore, v)
  5454  		} else {
  5455  			retain = append(retain, v)
  5456  		}
  5457  	}
  5458  
  5459  	newPeers, placementError := cc.selectPeerGroup(len(sa.Group.Peers), sa.Group.Cluster, sa.Config, retain, 0, ignore)
  5460  
  5461  	if placementError == nil {
  5462  		sa.Group.Peers = newPeers
  5463  		// Don't influence preferred leader.
  5464  		sa.Group.Preferred = _EMPTY_
  5465  		return true
  5466  	}
  5467  
  5468  	// If R1 just return to avoid bricking the stream.
  5469  	if sa.Group.node == nil || len(sa.Group.Peers) == 1 {
  5470  		return false
  5471  	}
  5472  
  5473  	// If we are here let's remove the peer at least, as long as we are R>1
  5474  	for i, peer := range sa.Group.Peers {
  5475  		if peer == removePeer {
  5476  			sa.Group.Peers[i] = sa.Group.Peers[len(sa.Group.Peers)-1]
  5477  			sa.Group.Peers = sa.Group.Peers[:len(sa.Group.Peers)-1]
  5478  			break
  5479  		}
  5480  	}
  5481  	return false
  5482  }
  5483  
  5484  type selectPeerError struct {
  5485  	excludeTag  bool
  5486  	offline     bool
  5487  	noStorage   bool
  5488  	uniqueTag   bool
  5489  	misc        bool
  5490  	noJsClust   bool
  5491  	noMatchTags map[string]struct{}
  5492  }
  5493  
  5494  func (e *selectPeerError) Error() string {
  5495  	b := strings.Builder{}
  5496  	writeBoolErrReason := func(hasErr bool, errMsg string) {
  5497  		if !hasErr {
  5498  			return
  5499  		}
  5500  		b.WriteString(", ")
  5501  		b.WriteString(errMsg)
  5502  	}
  5503  	b.WriteString("no suitable peers for placement")
  5504  	writeBoolErrReason(e.offline, "peer offline")
  5505  	writeBoolErrReason(e.excludeTag, "exclude tag set")
  5506  	writeBoolErrReason(e.noStorage, "insufficient storage")
  5507  	writeBoolErrReason(e.uniqueTag, "server tag not unique")
  5508  	writeBoolErrReason(e.misc, "miscellaneous issue")
  5509  	writeBoolErrReason(e.noJsClust, "jetstream not enabled in cluster")
  5510  	if len(e.noMatchTags) != 0 {
  5511  		b.WriteString(", tags not matched [")
  5512  		var firstTagWritten bool
  5513  		for tag := range e.noMatchTags {
  5514  			if firstTagWritten {
  5515  				b.WriteString(", ")
  5516  			}
  5517  			firstTagWritten = true
  5518  			b.WriteRune('\'')
  5519  			b.WriteString(tag)
  5520  			b.WriteRune('\'')
  5521  		}
  5522  		b.WriteString("]")
  5523  	}
  5524  	return b.String()
  5525  }
  5526  
  5527  func (e *selectPeerError) addMissingTag(t string) {
  5528  	if e.noMatchTags == nil {
  5529  		e.noMatchTags = map[string]struct{}{}
  5530  	}
  5531  	e.noMatchTags[t] = struct{}{}
  5532  }
  5533  
  5534  func (e *selectPeerError) accumulate(eAdd *selectPeerError) {
  5535  	if eAdd == nil {
  5536  		return
  5537  	}
  5538  	acc := func(val *bool, valAdd bool) {
  5539  		if valAdd {
  5540  			*val = valAdd
  5541  		}
  5542  	}
  5543  	acc(&e.offline, eAdd.offline)
  5544  	acc(&e.excludeTag, eAdd.excludeTag)
  5545  	acc(&e.noStorage, eAdd.noStorage)
  5546  	acc(&e.uniqueTag, eAdd.uniqueTag)
  5547  	acc(&e.misc, eAdd.misc)
  5548  	acc(&e.noJsClust, eAdd.noJsClust)
  5549  	for tag := range eAdd.noMatchTags {
  5550  		e.addMissingTag(tag)
  5551  	}
  5552  }
  5553  
  5554  // selectPeerGroup will select a group of peers to start a raft group.
  5555  // when peers exist already the unique tag prefix check for the replaceFirstExisting will be skipped
  5556  // js lock should be held.
  5557  func (cc *jetStreamCluster) selectPeerGroup(r int, cluster string, cfg *StreamConfig, existing []string, replaceFirstExisting int, ignore []string) ([]string, *selectPeerError) {
  5558  	if cluster == _EMPTY_ || cfg == nil {
  5559  		return nil, &selectPeerError{misc: true}
  5560  	}
  5561  
  5562  	var maxBytes uint64
  5563  	if cfg.MaxBytes > 0 {
  5564  		maxBytes = uint64(cfg.MaxBytes)
  5565  	}
  5566  
  5567  	// Check for tags.
  5568  	var tags []string
  5569  	if cfg.Placement != nil && len(cfg.Placement.Tags) > 0 {
  5570  		tags = cfg.Placement.Tags
  5571  	}
  5572  
  5573  	// Used for weighted sorting based on availability.
  5574  	type wn struct {
  5575  		id    string
  5576  		avail uint64
  5577  		ha    int
  5578  		ns    int
  5579  	}
  5580  
  5581  	var nodes []wn
  5582  	// peers is a randomized list
  5583  	s, peers := cc.s, cc.meta.Peers()
  5584  
  5585  	uniqueTagPrefix := s.getOpts().JetStreamUniqueTag
  5586  	if uniqueTagPrefix != _EMPTY_ {
  5587  		for _, tag := range tags {
  5588  			if strings.HasPrefix(tag, uniqueTagPrefix) {
  5589  				// disable uniqueness check if explicitly listed in tags
  5590  				uniqueTagPrefix = _EMPTY_
  5591  				break
  5592  			}
  5593  		}
  5594  	}
  5595  	var uniqueTags = make(map[string]*nodeInfo)
  5596  
  5597  	checkUniqueTag := func(ni *nodeInfo) (bool, *nodeInfo) {
  5598  		for _, t := range ni.tags {
  5599  			if strings.HasPrefix(t, uniqueTagPrefix) {
  5600  				if n, ok := uniqueTags[t]; !ok {
  5601  					uniqueTags[t] = ni
  5602  					return true, ni
  5603  				} else {
  5604  					return false, n
  5605  				}
  5606  			}
  5607  		}
  5608  		// default requires the unique prefix to be present
  5609  		return false, nil
  5610  	}
  5611  
  5612  	// Map existing.
  5613  	var ep map[string]struct{}
  5614  	if le := len(existing); le > 0 {
  5615  		if le >= r {
  5616  			return existing[:r], nil
  5617  		}
  5618  		ep = make(map[string]struct{})
  5619  		for i, p := range existing {
  5620  			ep[p] = struct{}{}
  5621  			if uniqueTagPrefix == _EMPTY_ {
  5622  				continue
  5623  			}
  5624  			si, ok := s.nodeToInfo.Load(p)
  5625  			if !ok || si == nil || i < replaceFirstExisting {
  5626  				continue
  5627  			}
  5628  			ni := si.(nodeInfo)
  5629  			// collect unique tags, but do not require them as this node is already part of the peerset
  5630  			checkUniqueTag(&ni)
  5631  		}
  5632  	}
  5633  
  5634  	// Map ignore
  5635  	var ip map[string]struct{}
  5636  	if li := len(ignore); li > 0 {
  5637  		ip = make(map[string]struct{})
  5638  		for _, p := range ignore {
  5639  			ip[p] = struct{}{}
  5640  		}
  5641  	}
  5642  
  5643  	// Grab the number of streams and HA assets currently assigned to each peer.
  5644  	// HAAssets under usage is async, so calculate here in realtime based on assignments.
  5645  	peerStreams := make(map[string]int, len(peers))
  5646  	peerHA := make(map[string]int, len(peers))
  5647  	for _, asa := range cc.streams {
  5648  		for _, sa := range asa {
  5649  			isHA := len(sa.Group.Peers) > 1
  5650  			for _, peer := range sa.Group.Peers {
  5651  				peerStreams[peer]++
  5652  				if isHA {
  5653  					peerHA[peer]++
  5654  				}
  5655  			}
  5656  		}
  5657  	}
  5658  
  5659  	maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets
  5660  
  5661  	// An error is a result of multiple individual placement decisions.
  5662  	// Which is why we keep taps on how often which one happened.
  5663  	err := selectPeerError{}
  5664  
  5665  	// Shuffle them up.
  5666  	rand.Shuffle(len(peers), func(i, j int) { peers[i], peers[j] = peers[j], peers[i] })
  5667  	for _, p := range peers {
  5668  		si, ok := s.nodeToInfo.Load(p.ID)
  5669  		if !ok || si == nil {
  5670  			err.misc = true
  5671  			continue
  5672  		}
  5673  		ni := si.(nodeInfo)
  5674  		// Only select from the designated named cluster.
  5675  		if ni.cluster != cluster {
  5676  			s.Debugf("Peer selection: discard %s@%s reason: not target cluster %s", ni.name, ni.cluster, cluster)
  5677  			continue
  5678  		}
  5679  
  5680  		// If we know its offline or we do not have config or err don't consider.
  5681  		if ni.offline || ni.cfg == nil || ni.stats == nil {
  5682  			s.Debugf("Peer selection: discard %s@%s reason: offline", ni.name, ni.cluster)
  5683  			err.offline = true
  5684  			continue
  5685  		}
  5686  
  5687  		// If ignore skip
  5688  		if _, ok := ip[p.ID]; ok {
  5689  			continue
  5690  		}
  5691  
  5692  		// If existing also skip, we will add back in to front of the list when done.
  5693  		if _, ok := ep[p.ID]; ok {
  5694  			continue
  5695  		}
  5696  
  5697  		if ni.tags.Contains(jsExcludePlacement) {
  5698  			s.Debugf("Peer selection: discard %s@%s tags: %v reason: %s present",
  5699  				ni.name, ni.cluster, ni.tags, jsExcludePlacement)
  5700  			err.excludeTag = true
  5701  			continue
  5702  		}
  5703  
  5704  		if len(tags) > 0 {
  5705  			matched := true
  5706  			for _, t := range tags {
  5707  				if !ni.tags.Contains(t) {
  5708  					matched = false
  5709  					s.Debugf("Peer selection: discard %s@%s tags: %v reason: mandatory tag %s not present",
  5710  						ni.name, ni.cluster, ni.tags, t)
  5711  					err.addMissingTag(t)
  5712  					break
  5713  				}
  5714  			}
  5715  			if !matched {
  5716  				continue
  5717  			}
  5718  		}
  5719  
  5720  		var available uint64
  5721  		if ni.stats != nil {
  5722  			switch cfg.Storage {
  5723  			case MemoryStorage:
  5724  				used := ni.stats.ReservedMemory
  5725  				if ni.stats.Memory > used {
  5726  					used = ni.stats.Memory
  5727  				}
  5728  				if ni.cfg.MaxMemory > int64(used) {
  5729  					available = uint64(ni.cfg.MaxMemory) - used
  5730  				}
  5731  			case FileStorage:
  5732  				used := ni.stats.ReservedStore
  5733  				if ni.stats.Store > used {
  5734  					used = ni.stats.Store
  5735  				}
  5736  				if ni.cfg.MaxStore > int64(used) {
  5737  					available = uint64(ni.cfg.MaxStore) - used
  5738  				}
  5739  			}
  5740  		}
  5741  
  5742  		// Otherwise check if we have enough room if maxBytes set.
  5743  		if maxBytes > 0 && maxBytes > available {
  5744  			s.Warnf("Peer selection: discard %s@%s (Max Bytes: %d) exceeds available %s storage of %d bytes",
  5745  				ni.name, ni.cluster, maxBytes, cfg.Storage.String(), available)
  5746  			err.noStorage = true
  5747  			continue
  5748  		}
  5749  		// HAAssets contain _meta_ which we want to ignore, hence > and not >=.
  5750  		if maxHaAssets > 0 && ni.stats != nil && ni.stats.HAAssets > maxHaAssets {
  5751  			s.Warnf("Peer selection: discard %s@%s (HA Asset Count: %d) exceeds max ha asset limit of %d for stream placement",
  5752  				ni.name, ni.cluster, ni.stats.HAAssets, maxHaAssets)
  5753  			err.misc = true
  5754  			continue
  5755  		}
  5756  
  5757  		if uniqueTagPrefix != _EMPTY_ {
  5758  			if unique, owner := checkUniqueTag(&ni); !unique {
  5759  				if owner != nil {
  5760  					s.Debugf("Peer selection: discard %s@%s tags:%v reason: unique prefix %s owned by %s@%s",
  5761  						ni.name, ni.cluster, ni.tags, owner.name, owner.cluster)
  5762  				} else {
  5763  					s.Debugf("Peer selection: discard %s@%s tags:%v reason: unique prefix %s not present",
  5764  						ni.name, ni.cluster, ni.tags)
  5765  				}
  5766  				err.uniqueTag = true
  5767  				continue
  5768  			}
  5769  		}
  5770  		// Add to our list of potential nodes.
  5771  		nodes = append(nodes, wn{p.ID, available, peerHA[p.ID], peerStreams[p.ID]})
  5772  	}
  5773  
  5774  	// If we could not select enough peers, fail.
  5775  	if len(nodes) < (r - len(existing)) {
  5776  		s.Debugf("Peer selection: required %d nodes but found %d (cluster: %s replica: %d existing: %v/%d peers: %d result-peers: %d err: %+v)",
  5777  			(r - len(existing)), len(nodes), cluster, r, existing, replaceFirstExisting, len(peers), len(nodes), err)
  5778  		if len(peers) == 0 {
  5779  			err.noJsClust = true
  5780  		}
  5781  		return nil, &err
  5782  	}
  5783  	// Sort based on available from most to least, breaking ties by number of total streams assigned to the peer.
  5784  	sort.Slice(nodes, func(i, j int) bool {
  5785  		if nodes[i].avail == nodes[j].avail {
  5786  			return nodes[i].ns < nodes[j].ns
  5787  		}
  5788  		return nodes[i].avail > nodes[j].avail
  5789  	})
  5790  	// If we are placing a replicated stream, let's sort based on HAAssets, as that is more important to balance.
  5791  	if cfg.Replicas > 1 {
  5792  		sort.SliceStable(nodes, func(i, j int) bool { return nodes[i].ha < nodes[j].ha })
  5793  	}
  5794  
  5795  	var results []string
  5796  	if len(existing) > 0 {
  5797  		results = append(results, existing...)
  5798  		r -= len(existing)
  5799  	}
  5800  	for _, r := range nodes[:r] {
  5801  		results = append(results, r.id)
  5802  	}
  5803  	return results, nil
  5804  }
  5805  
  5806  func groupNameForStream(peers []string, storage StorageType) string {
  5807  	return groupName("S", peers, storage)
  5808  }
  5809  
  5810  func groupNameForConsumer(peers []string, storage StorageType) string {
  5811  	return groupName("C", peers, storage)
  5812  }
  5813  
  5814  func groupName(prefix string, peers []string, storage StorageType) string {
  5815  	gns := getHash(nuid.Next())
  5816  	return fmt.Sprintf("%s-R%d%s-%s", prefix, len(peers), storage.String()[:1], gns)
  5817  }
  5818  
  5819  // returns stream count for this tier as well as applicable reservation size (not including reservations for cfg)
  5820  // jetStream read lock should be held
  5821  func tieredStreamAndReservationCount(asa map[string]*streamAssignment, tier string, cfg *StreamConfig) (int, int64) {
  5822  	var numStreams int
  5823  	var reservation int64
  5824  	for _, sa := range asa {
  5825  		if tier == _EMPTY_ || isSameTier(sa.Config, cfg) {
  5826  			numStreams++
  5827  			if sa.Config.MaxBytes > 0 && sa.Config.Storage == cfg.Storage && sa.Config.Name != cfg.Name {
  5828  				// If tier is empty, all storage is flat and we should adjust for replicas.
  5829  				// Otherwise if tiered, storage replication already taken into consideration.
  5830  				if tier == _EMPTY_ && cfg.Replicas > 1 {
  5831  					reservation += sa.Config.MaxBytes * int64(cfg.Replicas)
  5832  				} else {
  5833  					reservation += sa.Config.MaxBytes
  5834  				}
  5835  			}
  5836  		}
  5837  	}
  5838  	return numStreams, reservation
  5839  }
  5840  
  5841  // createGroupForStream will create a group for assignment for the stream.
  5842  // Lock should be held.
  5843  func (js *jetStream) createGroupForStream(ci *ClientInfo, cfg *StreamConfig) (*raftGroup, *selectPeerError) {
  5844  	replicas := cfg.Replicas
  5845  	if replicas == 0 {
  5846  		replicas = 1
  5847  	}
  5848  
  5849  	// Default connected cluster from the request origin.
  5850  	cc, cluster := js.cluster, ci.Cluster
  5851  	// If specified, override the default.
  5852  	clusterDefined := cfg.Placement != nil && cfg.Placement.Cluster != _EMPTY_
  5853  	if clusterDefined {
  5854  		cluster = cfg.Placement.Cluster
  5855  	}
  5856  	clusters := []string{cluster}
  5857  	if !clusterDefined {
  5858  		clusters = append(clusters, ci.Alternates...)
  5859  	}
  5860  
  5861  	// Need to create a group here.
  5862  	errs := &selectPeerError{}
  5863  	for _, cn := range clusters {
  5864  		peers, err := cc.selectPeerGroup(replicas, cn, cfg, nil, 0, nil)
  5865  		if len(peers) < replicas {
  5866  			errs.accumulate(err)
  5867  			continue
  5868  		}
  5869  		return &raftGroup{Name: groupNameForStream(peers, cfg.Storage), Storage: cfg.Storage, Peers: peers, Cluster: cn}, nil
  5870  	}
  5871  	return nil, errs
  5872  }
  5873  
  5874  func (acc *Account) selectLimits(cfg *StreamConfig) (*JetStreamAccountLimits, string, *jsAccount, *ApiError) {
  5875  	// Grab our jetstream account info.
  5876  	acc.mu.RLock()
  5877  	jsa := acc.js
  5878  	acc.mu.RUnlock()
  5879  
  5880  	if jsa == nil {
  5881  		return nil, _EMPTY_, nil, NewJSNotEnabledForAccountError()
  5882  	}
  5883  
  5884  	jsa.usageMu.RLock()
  5885  	selectedLimits, tierName, ok := jsa.selectLimits(cfg)
  5886  	jsa.usageMu.RUnlock()
  5887  
  5888  	if !ok {
  5889  		return nil, _EMPTY_, nil, NewJSNoLimitsError()
  5890  	}
  5891  	return &selectedLimits, tierName, jsa, nil
  5892  }
  5893  
  5894  // Read lock needs to be held
  5895  func (js *jetStream) jsClusteredStreamLimitsCheck(acc *Account, cfg *StreamConfig) *ApiError {
  5896  	selectedLimits, tier, _, apiErr := acc.selectLimits(cfg)
  5897  	if apiErr != nil {
  5898  		return apiErr
  5899  	}
  5900  
  5901  	asa := js.cluster.streams[acc.Name]
  5902  	numStreams, reservations := tieredStreamAndReservationCount(asa, tier, cfg)
  5903  	// Check for inflight proposals...
  5904  	if cc := js.cluster; cc != nil && cc.inflight != nil {
  5905  		numStreams += len(cc.inflight[acc.Name])
  5906  	}
  5907  	if selectedLimits.MaxStreams > 0 && numStreams >= selectedLimits.MaxStreams {
  5908  		return NewJSMaximumStreamsLimitError()
  5909  	}
  5910  	// Check for account limits here before proposing.
  5911  	if err := js.checkAccountLimits(selectedLimits, cfg, reservations); err != nil {
  5912  		return NewJSStreamLimitsError(err, Unless(err))
  5913  	}
  5914  	return nil
  5915  }
  5916  
  5917  func (s *Server) jsClusteredStreamRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, config *StreamConfig) {
  5918  	js, cc := s.getJetStreamCluster()
  5919  	if js == nil || cc == nil {
  5920  		return
  5921  	}
  5922  
  5923  	var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}
  5924  
  5925  	ccfg, apiErr := s.checkStreamCfg(config, acc)
  5926  	if apiErr != nil {
  5927  		resp.Error = apiErr
  5928  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  5929  		return
  5930  	}
  5931  	cfg := &ccfg
  5932  
  5933  	// Now process the request and proposal.
  5934  	js.mu.Lock()
  5935  	defer js.mu.Unlock()
  5936  
  5937  	var self *streamAssignment
  5938  	var rg *raftGroup
  5939  
  5940  	// Capture if we have existing assignment first.
  5941  	if osa := js.streamAssignment(acc.Name, cfg.Name); osa != nil {
  5942  		if !reflect.DeepEqual(osa.Config, cfg) {
  5943  			resp.Error = NewJSStreamNameExistError()
  5944  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  5945  			return
  5946  		}
  5947  		// This is an equal assignment.
  5948  		self, rg = osa, osa.Group
  5949  	}
  5950  
  5951  	if cfg.Sealed {
  5952  		resp.Error = NewJSStreamInvalidConfigError(fmt.Errorf("stream configuration for create can not be sealed"))
  5953  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  5954  		return
  5955  	}
  5956  
  5957  	// Check for subject collisions here.
  5958  	if cc.subjectsOverlap(acc.Name, cfg.Subjects, self) {
  5959  		resp.Error = NewJSStreamSubjectOverlapError()
  5960  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  5961  		return
  5962  	}
  5963  
  5964  	apiErr = js.jsClusteredStreamLimitsCheck(acc, cfg)
  5965  	// Check for stream limits here before proposing. These need to be tracked from meta layer, not jsa.
  5966  	if apiErr != nil {
  5967  		resp.Error = apiErr
  5968  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  5969  		return
  5970  	}
  5971  
  5972  	// Raft group selection and placement.
  5973  	if rg == nil {
  5974  		// Check inflight before proposing in case we have an existing inflight proposal.
  5975  		if cc.inflight == nil {
  5976  			cc.inflight = make(map[string]map[string]*raftGroup)
  5977  		}
  5978  		streams, ok := cc.inflight[acc.Name]
  5979  		if !ok {
  5980  			streams = make(map[string]*raftGroup)
  5981  			cc.inflight[acc.Name] = streams
  5982  		} else if existing, ok := streams[cfg.Name]; ok {
  5983  			// We have existing for same stream. Re-use same group.
  5984  			rg = existing
  5985  		}
  5986  	}
  5987  	// Create a new one here if needed.
  5988  	if rg == nil {
  5989  		nrg, err := js.createGroupForStream(ci, cfg)
  5990  		if err != nil {
  5991  			resp.Error = NewJSClusterNoPeersError(err)
  5992  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  5993  			return
  5994  		}
  5995  		rg = nrg
  5996  		// Pick a preferred leader.
  5997  		rg.setPreferred()
  5998  	}
  5999  
  6000  	// Sync subject for post snapshot sync.
  6001  	sa := &streamAssignment{Group: rg, Sync: syncSubjForStream(), Config: cfg, Subject: subject, Reply: reply, Client: ci, Created: time.Now().UTC()}
  6002  	if err := cc.meta.Propose(encodeAddStreamAssignment(sa)); err == nil {
  6003  		// On success, add this as an inflight proposal so we can apply limits
  6004  		// on concurrent create requests while this stream assignment has
  6005  		// possibly not been processed yet.
  6006  		if streams, ok := cc.inflight[acc.Name]; ok {
  6007  			streams[cfg.Name] = rg
  6008  		}
  6009  	}
  6010  }
  6011  
  6012  var (
  6013  	errReqTimeout = errors.New("timeout while waiting for response")
  6014  	errReqSrvExit = errors.New("server shutdown while waiting for response")
  6015  )
  6016  
  6017  // blocking utility call to perform requests on the system account
  6018  // returns (synchronized) v or error
  6019  func sysRequest[T any](s *Server, subjFormat string, args ...interface{}) (*T, error) {
  6020  	isubj := fmt.Sprintf(subjFormat, args...)
  6021  
  6022  	s.mu.Lock()
  6023  	inbox := s.newRespInbox()
  6024  	results := make(chan *T, 1)
  6025  	s.sys.replies[inbox] = func(_ *subscription, _ *client, _ *Account, _, _ string, msg []byte) {
  6026  		var v T
  6027  		if err := json.Unmarshal(msg, &v); err != nil {
  6028  			s.Warnf("Error unmarshalling response for request '%s':%v", isubj, err)
  6029  			return
  6030  		}
  6031  		select {
  6032  		case results <- &v:
  6033  		default:
  6034  			s.Warnf("Failed placing request response on internal channel")
  6035  		}
  6036  	}
  6037  	s.mu.Unlock()
  6038  
  6039  	s.sendInternalMsgLocked(isubj, inbox, nil, nil)
  6040  
  6041  	defer func() {
  6042  		s.mu.Lock()
  6043  		defer s.mu.Unlock()
  6044  		if s.sys != nil && s.sys.replies != nil {
  6045  			delete(s.sys.replies, inbox)
  6046  		}
  6047  	}()
  6048  
  6049  	ttl := time.NewTimer(2 * time.Second)
  6050  	defer ttl.Stop()
  6051  
  6052  	select {
  6053  	case <-s.quitCh:
  6054  		return nil, errReqSrvExit
  6055  	case <-ttl.C:
  6056  		return nil, errReqTimeout
  6057  	case data := <-results:
  6058  		return data, nil
  6059  	}
  6060  }
  6061  
  6062  func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, cfg *StreamConfig, peerSet []string) {
  6063  	js, cc := s.getJetStreamCluster()
  6064  	if js == nil || cc == nil {
  6065  		return
  6066  	}
  6067  
  6068  	// Now process the request and proposal.
  6069  	js.mu.Lock()
  6070  	defer js.mu.Unlock()
  6071  	meta := cc.meta
  6072  	if meta == nil {
  6073  		return
  6074  	}
  6075  
  6076  	var resp = JSApiStreamUpdateResponse{ApiResponse: ApiResponse{Type: JSApiStreamUpdateResponseType}}
  6077  
  6078  	osa := js.streamAssignment(acc.Name, cfg.Name)
  6079  
  6080  	if osa == nil {
  6081  		resp.Error = NewJSStreamNotFoundError()
  6082  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6083  		return
  6084  	}
  6085  	var newCfg *StreamConfig
  6086  	if jsa := js.accounts[acc.Name]; jsa != nil {
  6087  		js.mu.Unlock()
  6088  		ncfg, err := jsa.configUpdateCheck(osa.Config, cfg, s)
  6089  		js.mu.Lock()
  6090  		if err != nil {
  6091  			resp.Error = NewJSStreamUpdateError(err, Unless(err))
  6092  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6093  			return
  6094  		} else {
  6095  			newCfg = ncfg
  6096  		}
  6097  	} else {
  6098  		resp.Error = NewJSNotEnabledForAccountError()
  6099  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6100  		return
  6101  	}
  6102  	// Check for mirror changes which are not allowed.
  6103  	if !reflect.DeepEqual(newCfg.Mirror, osa.Config.Mirror) {
  6104  		resp.Error = NewJSStreamMirrorNotUpdatableError()
  6105  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6106  		return
  6107  	}
  6108  
  6109  	// Check for subject collisions here.
  6110  	if cc.subjectsOverlap(acc.Name, cfg.Subjects, osa) {
  6111  		resp.Error = NewJSStreamSubjectOverlapError()
  6112  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6113  		return
  6114  	}
  6115  
  6116  	// Make copy so to not change original.
  6117  	rg := osa.copyGroup().Group
  6118  
  6119  	// Check for a move request.
  6120  	var isMoveRequest, isMoveCancel bool
  6121  	if lPeerSet := len(peerSet); lPeerSet > 0 {
  6122  		isMoveRequest = true
  6123  		// check if this is a cancellation
  6124  		if lPeerSet == osa.Config.Replicas && lPeerSet <= len(rg.Peers) {
  6125  			isMoveCancel = true
  6126  			// can only be a cancellation if the peer sets overlap as expected
  6127  			for i := 0; i < lPeerSet; i++ {
  6128  				if peerSet[i] != rg.Peers[i] {
  6129  					isMoveCancel = false
  6130  					break
  6131  				}
  6132  			}
  6133  		}
  6134  	} else {
  6135  		isMoveRequest = newCfg.Placement != nil && !reflect.DeepEqual(osa.Config.Placement, newCfg.Placement)
  6136  	}
  6137  
  6138  	// Check for replica changes.
  6139  	isReplicaChange := newCfg.Replicas != osa.Config.Replicas
  6140  
  6141  	// We stage consumer updates and do them after the stream update.
  6142  	var consumers []*consumerAssignment
  6143  
  6144  	// Check if this is a move request, but no cancellation, and we are already moving this stream.
  6145  	if isMoveRequest && !isMoveCancel && osa.Config.Replicas != len(rg.Peers) {
  6146  		// obtain stats to include in error message
  6147  		msg := _EMPTY_
  6148  		if s.allPeersOffline(rg) {
  6149  			msg = fmt.Sprintf("all %d peers offline", len(rg.Peers))
  6150  		} else {
  6151  			// Need to release js lock.
  6152  			js.mu.Unlock()
  6153  			if si, err := sysRequest[StreamInfo](s, clusterStreamInfoT, ci.serviceAccount(), cfg.Name); err != nil {
  6154  				msg = fmt.Sprintf("error retrieving info: %s", err.Error())
  6155  			} else if si != nil {
  6156  				currentCount := 0
  6157  				if si.Cluster.Leader != _EMPTY_ {
  6158  					currentCount++
  6159  				}
  6160  				combinedLag := uint64(0)
  6161  				for _, r := range si.Cluster.Replicas {
  6162  					if r.Current {
  6163  						currentCount++
  6164  					}
  6165  					combinedLag += r.Lag
  6166  				}
  6167  				msg = fmt.Sprintf("total peers: %d, current peers: %d, combined lag: %d",
  6168  					len(rg.Peers), currentCount, combinedLag)
  6169  			}
  6170  			// Re-acquire here.
  6171  			js.mu.Lock()
  6172  		}
  6173  		resp.Error = NewJSStreamMoveInProgressError(msg)
  6174  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6175  		return
  6176  	}
  6177  
  6178  	// Can not move and scale at same time.
  6179  	if isMoveRequest && isReplicaChange {
  6180  		resp.Error = NewJSStreamMoveAndScaleError()
  6181  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6182  		return
  6183  	}
  6184  
  6185  	if isReplicaChange {
  6186  		// We are adding new peers here.
  6187  		if newCfg.Replicas > len(rg.Peers) {
  6188  			// Check that we have the allocation available.
  6189  			if err := js.jsClusteredStreamLimitsCheck(acc, newCfg); err != nil {
  6190  				resp.Error = err
  6191  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6192  				return
  6193  			}
  6194  			// Check if we do not have a cluster assigned, and if we do not make sure we
  6195  			// try to pick one. This could happen with older streams that were assigned by
  6196  			// previous servers.
  6197  			if rg.Cluster == _EMPTY_ {
  6198  				// Prefer placement directrives if we have them.
  6199  				if newCfg.Placement != nil && newCfg.Placement.Cluster != _EMPTY_ {
  6200  					rg.Cluster = newCfg.Placement.Cluster
  6201  				} else {
  6202  					// Fall back to the cluster assignment from the client.
  6203  					rg.Cluster = ci.Cluster
  6204  				}
  6205  			}
  6206  			peers, err := cc.selectPeerGroup(newCfg.Replicas, rg.Cluster, newCfg, rg.Peers, 0, nil)
  6207  			if err != nil {
  6208  				resp.Error = NewJSClusterNoPeersError(err)
  6209  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6210  				return
  6211  			}
  6212  			// Single nodes are not recorded by the NRG layer so we can rename.
  6213  			if len(peers) == 1 {
  6214  				rg.Name = groupNameForStream(peers, rg.Storage)
  6215  			} else if len(rg.Peers) == 1 {
  6216  				// This is scale up from being a singelton, set preferred to that singelton.
  6217  				rg.Preferred = rg.Peers[0]
  6218  			}
  6219  			rg.Peers = peers
  6220  		} else {
  6221  			// We are deleting nodes here. We want to do our best to preserve the current leader.
  6222  			// We have support now from above that guarantees we are in our own Go routine, so can
  6223  			// ask for stream info from the stream leader to make sure we keep the leader in the new list.
  6224  			var curLeader string
  6225  			if !s.allPeersOffline(rg) {
  6226  				// Need to release js lock.
  6227  				js.mu.Unlock()
  6228  				if si, err := sysRequest[StreamInfo](s, clusterStreamInfoT, ci.serviceAccount(), cfg.Name); err != nil {
  6229  					s.Warnf("Did not receive stream info results for '%s > %s' due to: %s", acc, cfg.Name, err)
  6230  				} else if si != nil {
  6231  					if cl := si.Cluster; cl != nil && cl.Leader != _EMPTY_ {
  6232  						curLeader = getHash(cl.Leader)
  6233  					}
  6234  				}
  6235  				// Re-acquire here.
  6236  				js.mu.Lock()
  6237  			}
  6238  			// If we identified a leader make sure its part of the new group.
  6239  			selected := make([]string, 0, newCfg.Replicas)
  6240  
  6241  			if curLeader != _EMPTY_ {
  6242  				selected = append(selected, curLeader)
  6243  			}
  6244  			for _, peer := range rg.Peers {
  6245  				if len(selected) == newCfg.Replicas {
  6246  					break
  6247  				}
  6248  				if peer == curLeader {
  6249  					continue
  6250  				}
  6251  				if si, ok := s.nodeToInfo.Load(peer); ok && si != nil {
  6252  					if si.(nodeInfo).offline {
  6253  						continue
  6254  					}
  6255  					selected = append(selected, peer)
  6256  				}
  6257  			}
  6258  			rg.Peers = selected
  6259  		}
  6260  
  6261  		// Need to remap any consumers.
  6262  		for _, ca := range osa.consumers {
  6263  			// Ephemerals are R=1, so only auto-remap durables, or R>1, unless stream is interest or workqueue policy.
  6264  			numPeers := len(ca.Group.Peers)
  6265  			if ca.Config.Durable != _EMPTY_ || numPeers > 1 || cfg.Retention != LimitsPolicy {
  6266  				cca := ca.copyGroup()
  6267  				// Adjust preferred as needed.
  6268  				if numPeers == 1 && len(rg.Peers) > 1 {
  6269  					cca.Group.Preferred = ca.Group.Peers[0]
  6270  				} else {
  6271  					cca.Group.Preferred = _EMPTY_
  6272  				}
  6273  				// Assign new peers.
  6274  				cca.Group.Peers = rg.Peers
  6275  				// We can not propose here before the stream itself so we collect them.
  6276  				consumers = append(consumers, cca)
  6277  			}
  6278  		}
  6279  	} else if isMoveRequest {
  6280  		if len(peerSet) == 0 {
  6281  			nrg, err := js.createGroupForStream(ci, newCfg)
  6282  			if err != nil {
  6283  				resp.Error = NewJSClusterNoPeersError(err)
  6284  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6285  				return
  6286  			}
  6287  			// filter peers present in both sets
  6288  			for _, peer := range rg.Peers {
  6289  				found := false
  6290  				for _, newPeer := range nrg.Peers {
  6291  					if peer == newPeer {
  6292  						found = true
  6293  						break
  6294  					}
  6295  				}
  6296  				if !found {
  6297  					peerSet = append(peerSet, peer)
  6298  				}
  6299  			}
  6300  			peerSet = append(peerSet, nrg.Peers...)
  6301  		}
  6302  		if len(rg.Peers) == 1 {
  6303  			rg.Preferred = peerSet[0]
  6304  		}
  6305  		rg.Peers = peerSet
  6306  
  6307  		for _, ca := range osa.consumers {
  6308  			cca := ca.copyGroup()
  6309  			r := cca.Config.replicas(osa.Config)
  6310  			// shuffle part of cluster peer set we will be keeping
  6311  			randPeerSet := copyStrings(peerSet[len(peerSet)-newCfg.Replicas:])
  6312  			rand.Shuffle(newCfg.Replicas, func(i, j int) { randPeerSet[i], randPeerSet[j] = randPeerSet[j], randPeerSet[i] })
  6313  			// move overlapping peers at the end of randPeerSet and keep a tally of non overlapping peers
  6314  			dropPeerSet := make([]string, 0, len(cca.Group.Peers))
  6315  			for _, p := range cca.Group.Peers {
  6316  				found := false
  6317  				for i, rp := range randPeerSet {
  6318  					if p == rp {
  6319  						randPeerSet[i] = randPeerSet[newCfg.Replicas-1]
  6320  						randPeerSet[newCfg.Replicas-1] = p
  6321  						found = true
  6322  						break
  6323  					}
  6324  				}
  6325  				if !found {
  6326  					dropPeerSet = append(dropPeerSet, p)
  6327  				}
  6328  			}
  6329  			cPeerSet := randPeerSet[newCfg.Replicas-r:]
  6330  			// In case of a set or cancel simply assign
  6331  			if len(peerSet) == newCfg.Replicas {
  6332  				cca.Group.Peers = cPeerSet
  6333  			} else {
  6334  				cca.Group.Peers = append(dropPeerSet, cPeerSet...)
  6335  			}
  6336  			// make sure it overlaps with peers and remove if not
  6337  			if cca.Group.Preferred != _EMPTY_ {
  6338  				found := false
  6339  				for _, p := range cca.Group.Peers {
  6340  					if p == cca.Group.Preferred {
  6341  						found = true
  6342  						break
  6343  					}
  6344  				}
  6345  				if !found {
  6346  					cca.Group.Preferred = _EMPTY_
  6347  				}
  6348  			}
  6349  			// We can not propose here before the stream itself so we collect them.
  6350  			consumers = append(consumers, cca)
  6351  		}
  6352  	} else {
  6353  		// All other updates make sure no preferred is set.
  6354  		rg.Preferred = _EMPTY_
  6355  	}
  6356  
  6357  	sa := &streamAssignment{Group: rg, Sync: osa.Sync, Created: osa.Created, Config: newCfg, Subject: subject, Reply: reply, Client: ci}
  6358  	meta.Propose(encodeUpdateStreamAssignment(sa))
  6359  
  6360  	// Process any staged consumers.
  6361  	for _, ca := range consumers {
  6362  		meta.Propose(encodeAddConsumerAssignment(ca))
  6363  	}
  6364  }
  6365  
  6366  func (s *Server) jsClusteredStreamDeleteRequest(ci *ClientInfo, acc *Account, stream, subject, reply string, rmsg []byte) {
  6367  	js, cc := s.getJetStreamCluster()
  6368  	if js == nil || cc == nil {
  6369  		return
  6370  	}
  6371  
  6372  	js.mu.Lock()
  6373  	defer js.mu.Unlock()
  6374  
  6375  	if cc.meta == nil {
  6376  		return
  6377  	}
  6378  
  6379  	osa := js.streamAssignment(acc.Name, stream)
  6380  	if osa == nil {
  6381  		var resp = JSApiStreamDeleteResponse{ApiResponse: ApiResponse{Type: JSApiStreamDeleteResponseType}}
  6382  		resp.Error = NewJSStreamNotFoundError()
  6383  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6384  		return
  6385  	}
  6386  
  6387  	sa := &streamAssignment{Group: osa.Group, Config: osa.Config, Subject: subject, Reply: reply, Client: ci}
  6388  	cc.meta.Propose(encodeDeleteStreamAssignment(sa))
  6389  }
  6390  
  6391  // Process a clustered purge request.
  6392  func (s *Server) jsClusteredStreamPurgeRequest(
  6393  	ci *ClientInfo,
  6394  	acc *Account,
  6395  	mset *stream,
  6396  	stream, subject, reply string,
  6397  	rmsg []byte,
  6398  	preq *JSApiStreamPurgeRequest,
  6399  ) {
  6400  	js, cc := s.getJetStreamCluster()
  6401  	if js == nil || cc == nil {
  6402  		return
  6403  	}
  6404  
  6405  	js.mu.Lock()
  6406  	sa := js.streamAssignment(acc.Name, stream)
  6407  	if sa == nil {
  6408  		resp := JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}}
  6409  		resp.Error = NewJSStreamNotFoundError()
  6410  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6411  		js.mu.Unlock()
  6412  		return
  6413  	}
  6414  
  6415  	if n := sa.Group.node; n != nil {
  6416  		sp := &streamPurge{Stream: stream, LastSeq: mset.state().LastSeq, Subject: subject, Reply: reply, Client: ci, Request: preq}
  6417  		n.Propose(encodeStreamPurge(sp))
  6418  		js.mu.Unlock()
  6419  		return
  6420  	}
  6421  	js.mu.Unlock()
  6422  
  6423  	if mset == nil {
  6424  		return
  6425  	}
  6426  
  6427  	var resp = JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}}
  6428  	purged, err := mset.purge(preq)
  6429  	if err != nil {
  6430  		resp.Error = NewJSStreamGeneralError(err, Unless(err))
  6431  	} else {
  6432  		resp.Purged = purged
  6433  		resp.Success = true
  6434  	}
  6435  	s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6436  }
  6437  
  6438  func (s *Server) jsClusteredStreamRestoreRequest(
  6439  	ci *ClientInfo,
  6440  	acc *Account,
  6441  	req *JSApiStreamRestoreRequest,
  6442  	stream, subject, reply string, rmsg []byte) {
  6443  
  6444  	js, cc := s.getJetStreamCluster()
  6445  	if js == nil || cc == nil {
  6446  		return
  6447  	}
  6448  
  6449  	js.mu.Lock()
  6450  	defer js.mu.Unlock()
  6451  
  6452  	if cc.meta == nil {
  6453  		return
  6454  	}
  6455  
  6456  	cfg := &req.Config
  6457  	resp := JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}}
  6458  
  6459  	if err := js.jsClusteredStreamLimitsCheck(acc, cfg); err != nil {
  6460  		resp.Error = err
  6461  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6462  		return
  6463  	}
  6464  
  6465  	if sa := js.streamAssignment(ci.serviceAccount(), cfg.Name); sa != nil {
  6466  		resp.Error = NewJSStreamNameExistRestoreFailedError()
  6467  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6468  		return
  6469  	}
  6470  
  6471  	// Raft group selection and placement.
  6472  	rg, err := js.createGroupForStream(ci, cfg)
  6473  	if err != nil {
  6474  		resp.Error = NewJSClusterNoPeersError(err)
  6475  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6476  		return
  6477  	}
  6478  	// Pick a preferred leader.
  6479  	rg.setPreferred()
  6480  	sa := &streamAssignment{Group: rg, Sync: syncSubjForStream(), Config: cfg, Subject: subject, Reply: reply, Client: ci, Created: time.Now().UTC()}
  6481  	// Now add in our restore state and pre-select a peer to handle the actual receipt of the snapshot.
  6482  	sa.Restore = &req.State
  6483  	cc.meta.Propose(encodeAddStreamAssignment(sa))
  6484  }
  6485  
  6486  // Determine if all peers for this group are offline.
  6487  func (s *Server) allPeersOffline(rg *raftGroup) bool {
  6488  	if rg == nil {
  6489  		return false
  6490  	}
  6491  	// Check to see if this stream has any servers online to respond.
  6492  	for _, peer := range rg.Peers {
  6493  		if si, ok := s.nodeToInfo.Load(peer); ok && si != nil {
  6494  			if !si.(nodeInfo).offline {
  6495  				return false
  6496  			}
  6497  		}
  6498  	}
  6499  	return true
  6500  }
  6501  
  6502  // This will do a scatter and gather operation for all streams for this account. This is only called from metadata leader.
  6503  // This will be running in a separate Go routine.
  6504  func (s *Server) jsClusteredStreamListRequest(acc *Account, ci *ClientInfo, filter string, offset int, subject, reply string, rmsg []byte) {
  6505  	defer s.grWG.Done()
  6506  
  6507  	js, cc := s.getJetStreamCluster()
  6508  	if js == nil || cc == nil {
  6509  		return
  6510  	}
  6511  
  6512  	js.mu.RLock()
  6513  
  6514  	var streams []*streamAssignment
  6515  	for _, sa := range cc.streams[acc.Name] {
  6516  		if IsNatsErr(sa.err, JSClusterNotAssignedErr) {
  6517  			continue
  6518  		}
  6519  
  6520  		if filter != _EMPTY_ {
  6521  			// These could not have subjects auto-filled in since they are raw and unprocessed.
  6522  			if len(sa.Config.Subjects) == 0 {
  6523  				if SubjectsCollide(filter, sa.Config.Name) {
  6524  					streams = append(streams, sa)
  6525  				}
  6526  			} else {
  6527  				for _, subj := range sa.Config.Subjects {
  6528  					if SubjectsCollide(filter, subj) {
  6529  						streams = append(streams, sa)
  6530  						break
  6531  					}
  6532  				}
  6533  			}
  6534  		} else {
  6535  			streams = append(streams, sa)
  6536  		}
  6537  	}
  6538  
  6539  	// Needs to be sorted for offsets etc.
  6540  	if len(streams) > 1 {
  6541  		sort.Slice(streams, func(i, j int) bool {
  6542  			return strings.Compare(streams[i].Config.Name, streams[j].Config.Name) < 0
  6543  		})
  6544  	}
  6545  
  6546  	scnt := len(streams)
  6547  	if offset > scnt {
  6548  		offset = scnt
  6549  	}
  6550  	if offset > 0 {
  6551  		streams = streams[offset:]
  6552  	}
  6553  	if len(streams) > JSApiListLimit {
  6554  		streams = streams[:JSApiListLimit]
  6555  	}
  6556  
  6557  	var resp = JSApiStreamListResponse{
  6558  		ApiResponse: ApiResponse{Type: JSApiStreamListResponseType},
  6559  		Streams:     make([]*StreamInfo, 0, len(streams)),
  6560  	}
  6561  
  6562  	js.mu.RUnlock()
  6563  
  6564  	if len(streams) == 0 {
  6565  		resp.Limit = JSApiListLimit
  6566  		resp.Offset = offset
  6567  		s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6568  		return
  6569  	}
  6570  
  6571  	// Create an inbox for our responses and send out our requests.
  6572  	s.mu.Lock()
  6573  	inbox := s.newRespInbox()
  6574  	rc := make(chan *StreamInfo, len(streams))
  6575  
  6576  	// Store our handler.
  6577  	s.sys.replies[inbox] = func(sub *subscription, _ *client, _ *Account, subject, _ string, msg []byte) {
  6578  		var si StreamInfo
  6579  		if err := json.Unmarshal(msg, &si); err != nil {
  6580  			s.Warnf("Error unmarshalling clustered stream info response:%v", err)
  6581  			return
  6582  		}
  6583  		select {
  6584  		case rc <- &si:
  6585  		default:
  6586  			s.Warnf("Failed placing remote stream info result on internal channel")
  6587  		}
  6588  	}
  6589  	s.mu.Unlock()
  6590  
  6591  	// Cleanup after.
  6592  	defer func() {
  6593  		s.mu.Lock()
  6594  		if s.sys != nil && s.sys.replies != nil {
  6595  			delete(s.sys.replies, inbox)
  6596  		}
  6597  		s.mu.Unlock()
  6598  	}()
  6599  
  6600  	var missingNames []string
  6601  	sent := map[string]int{}
  6602  
  6603  	// Send out our requests here.
  6604  	js.mu.RLock()
  6605  	for _, sa := range streams {
  6606  		if s.allPeersOffline(sa.Group) {
  6607  			// Place offline onto our results by hand here.
  6608  			si := &StreamInfo{
  6609  				Config:    *sa.Config,
  6610  				Created:   sa.Created,
  6611  				Cluster:   js.offlineClusterInfo(sa.Group),
  6612  				TimeStamp: time.Now().UTC(),
  6613  			}
  6614  			resp.Streams = append(resp.Streams, si)
  6615  			missingNames = append(missingNames, sa.Config.Name)
  6616  		} else {
  6617  			isubj := fmt.Sprintf(clusterStreamInfoT, sa.Client.serviceAccount(), sa.Config.Name)
  6618  			s.sendInternalMsgLocked(isubj, inbox, nil, nil)
  6619  			sent[sa.Config.Name] = len(sa.consumers)
  6620  		}
  6621  	}
  6622  	// Don't hold lock.
  6623  	js.mu.RUnlock()
  6624  
  6625  	const timeout = 4 * time.Second
  6626  	notActive := time.NewTimer(timeout)
  6627  	defer notActive.Stop()
  6628  
  6629  LOOP:
  6630  	for len(sent) > 0 {
  6631  		select {
  6632  		case <-s.quitCh:
  6633  			return
  6634  		case <-notActive.C:
  6635  			s.Warnf("Did not receive all stream info results for %q", acc)
  6636  			for sName := range sent {
  6637  				missingNames = append(missingNames, sName)
  6638  			}
  6639  			break LOOP
  6640  		case si := <-rc:
  6641  			consCount := sent[si.Config.Name]
  6642  			if consCount > 0 {
  6643  				si.State.Consumers = consCount
  6644  			}
  6645  			delete(sent, si.Config.Name)
  6646  			resp.Streams = append(resp.Streams, si)
  6647  			// Check to see if we are done.
  6648  			if len(resp.Streams) == len(streams) {
  6649  				break LOOP
  6650  			}
  6651  		}
  6652  	}
  6653  
  6654  	// Needs to be sorted as well.
  6655  	if len(resp.Streams) > 1 {
  6656  		sort.Slice(resp.Streams, func(i, j int) bool {
  6657  			return strings.Compare(resp.Streams[i].Config.Name, resp.Streams[j].Config.Name) < 0
  6658  		})
  6659  	}
  6660  
  6661  	resp.Total = scnt
  6662  	resp.Limit = JSApiListLimit
  6663  	resp.Offset = offset
  6664  	resp.Missing = missingNames
  6665  	s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6666  }
  6667  
  6668  // This will do a scatter and gather operation for all consumers for this stream and account.
  6669  // This will be running in a separate Go routine.
  6670  func (s *Server) jsClusteredConsumerListRequest(acc *Account, ci *ClientInfo, offset int, stream, subject, reply string, rmsg []byte) {
  6671  	defer s.grWG.Done()
  6672  
  6673  	js, cc := s.getJetStreamCluster()
  6674  	if js == nil || cc == nil {
  6675  		return
  6676  	}
  6677  
  6678  	js.mu.RLock()
  6679  
  6680  	var consumers []*consumerAssignment
  6681  	if sas := cc.streams[acc.Name]; sas != nil {
  6682  		if sa := sas[stream]; sa != nil {
  6683  			// Copy over since we need to sort etc.
  6684  			for _, ca := range sa.consumers {
  6685  				consumers = append(consumers, ca)
  6686  			}
  6687  		}
  6688  	}
  6689  	// Needs to be sorted.
  6690  	if len(consumers) > 1 {
  6691  		sort.Slice(consumers, func(i, j int) bool {
  6692  			return strings.Compare(consumers[i].Name, consumers[j].Name) < 0
  6693  		})
  6694  	}
  6695  
  6696  	ocnt := len(consumers)
  6697  	if offset > ocnt {
  6698  		offset = ocnt
  6699  	}
  6700  	if offset > 0 {
  6701  		consumers = consumers[offset:]
  6702  	}
  6703  	if len(consumers) > JSApiListLimit {
  6704  		consumers = consumers[:JSApiListLimit]
  6705  	}
  6706  
  6707  	// Send out our requests here.
  6708  	var resp = JSApiConsumerListResponse{
  6709  		ApiResponse: ApiResponse{Type: JSApiConsumerListResponseType},
  6710  		Consumers:   []*ConsumerInfo{},
  6711  	}
  6712  
  6713  	js.mu.RUnlock()
  6714  
  6715  	if len(consumers) == 0 {
  6716  		resp.Limit = JSApiListLimit
  6717  		resp.Offset = offset
  6718  		s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6719  		return
  6720  	}
  6721  
  6722  	// Create an inbox for our responses and send out requests.
  6723  	s.mu.Lock()
  6724  	inbox := s.newRespInbox()
  6725  	rc := make(chan *ConsumerInfo, len(consumers))
  6726  
  6727  	// Store our handler.
  6728  	s.sys.replies[inbox] = func(sub *subscription, _ *client, _ *Account, subject, _ string, msg []byte) {
  6729  		var ci ConsumerInfo
  6730  		if err := json.Unmarshal(msg, &ci); err != nil {
  6731  			s.Warnf("Error unmarshaling clustered consumer info response:%v", err)
  6732  			return
  6733  		}
  6734  		select {
  6735  		case rc <- &ci:
  6736  		default:
  6737  			s.Warnf("Failed placing consumer info result on internal chan")
  6738  		}
  6739  	}
  6740  	s.mu.Unlock()
  6741  
  6742  	// Cleanup after.
  6743  	defer func() {
  6744  		s.mu.Lock()
  6745  		if s.sys != nil && s.sys.replies != nil {
  6746  			delete(s.sys.replies, inbox)
  6747  		}
  6748  		s.mu.Unlock()
  6749  	}()
  6750  
  6751  	var missingNames []string
  6752  	sent := map[string]struct{}{}
  6753  
  6754  	// Send out our requests here.
  6755  	js.mu.RLock()
  6756  	for _, ca := range consumers {
  6757  		if s.allPeersOffline(ca.Group) {
  6758  			// Place offline onto our results by hand here.
  6759  			ci := &ConsumerInfo{
  6760  				Config:    ca.Config,
  6761  				Created:   ca.Created,
  6762  				Cluster:   js.offlineClusterInfo(ca.Group),
  6763  				TimeStamp: time.Now().UTC(),
  6764  			}
  6765  			resp.Consumers = append(resp.Consumers, ci)
  6766  			missingNames = append(missingNames, ca.Name)
  6767  		} else {
  6768  			isubj := fmt.Sprintf(clusterConsumerInfoT, ca.Client.serviceAccount(), stream, ca.Name)
  6769  			s.sendInternalMsgLocked(isubj, inbox, nil, nil)
  6770  			sent[ca.Name] = struct{}{}
  6771  		}
  6772  	}
  6773  	// Don't hold lock.
  6774  	js.mu.RUnlock()
  6775  
  6776  	const timeout = 4 * time.Second
  6777  	notActive := time.NewTimer(timeout)
  6778  	defer notActive.Stop()
  6779  
  6780  LOOP:
  6781  	for len(sent) > 0 {
  6782  		select {
  6783  		case <-s.quitCh:
  6784  			return
  6785  		case <-notActive.C:
  6786  			s.Warnf("Did not receive all consumer info results for '%s > %s'", acc, stream)
  6787  			for cName := range sent {
  6788  				missingNames = append(missingNames, cName)
  6789  			}
  6790  			break LOOP
  6791  		case ci := <-rc:
  6792  			delete(sent, ci.Name)
  6793  			resp.Consumers = append(resp.Consumers, ci)
  6794  			// Check to see if we are done.
  6795  			if len(resp.Consumers) == len(consumers) {
  6796  				break LOOP
  6797  			}
  6798  		}
  6799  	}
  6800  
  6801  	// Needs to be sorted as well.
  6802  	if len(resp.Consumers) > 1 {
  6803  		sort.Slice(resp.Consumers, func(i, j int) bool {
  6804  			return strings.Compare(resp.Consumers[i].Name, resp.Consumers[j].Name) < 0
  6805  		})
  6806  	}
  6807  
  6808  	resp.Total = ocnt
  6809  	resp.Limit = JSApiListLimit
  6810  	resp.Offset = offset
  6811  	resp.Missing = missingNames
  6812  	s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6813  }
  6814  
  6815  func encodeStreamPurge(sp *streamPurge) []byte {
  6816  	var bb bytes.Buffer
  6817  	bb.WriteByte(byte(purgeStreamOp))
  6818  	json.NewEncoder(&bb).Encode(sp)
  6819  	return bb.Bytes()
  6820  }
  6821  
  6822  func decodeStreamPurge(buf []byte) (*streamPurge, error) {
  6823  	var sp streamPurge
  6824  	err := json.Unmarshal(buf, &sp)
  6825  	return &sp, err
  6826  }
  6827  
  6828  func (s *Server) jsClusteredConsumerDeleteRequest(ci *ClientInfo, acc *Account, stream, consumer, subject, reply string, rmsg []byte) {
  6829  	js, cc := s.getJetStreamCluster()
  6830  	if js == nil || cc == nil {
  6831  		return
  6832  	}
  6833  
  6834  	js.mu.Lock()
  6835  	defer js.mu.Unlock()
  6836  
  6837  	if cc.meta == nil {
  6838  		return
  6839  	}
  6840  
  6841  	var resp = JSApiConsumerDeleteResponse{ApiResponse: ApiResponse{Type: JSApiConsumerDeleteResponseType}}
  6842  
  6843  	sa := js.streamAssignment(acc.Name, stream)
  6844  	if sa == nil {
  6845  		resp.Error = NewJSStreamNotFoundError()
  6846  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6847  		return
  6848  
  6849  	}
  6850  	if sa.consumers == nil {
  6851  		resp.Error = NewJSConsumerNotFoundError()
  6852  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6853  		return
  6854  	}
  6855  	oca := sa.consumers[consumer]
  6856  	if oca == nil {
  6857  		resp.Error = NewJSConsumerNotFoundError()
  6858  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6859  		return
  6860  	}
  6861  	oca.deleted = true
  6862  	ca := &consumerAssignment{Group: oca.Group, Stream: stream, Name: consumer, Config: oca.Config, Subject: subject, Reply: reply, Client: ci}
  6863  	cc.meta.Propose(encodeDeleteConsumerAssignment(ca))
  6864  }
  6865  
  6866  func encodeMsgDelete(md *streamMsgDelete) []byte {
  6867  	var bb bytes.Buffer
  6868  	bb.WriteByte(byte(deleteMsgOp))
  6869  	json.NewEncoder(&bb).Encode(md)
  6870  	return bb.Bytes()
  6871  }
  6872  
  6873  func decodeMsgDelete(buf []byte) (*streamMsgDelete, error) {
  6874  	var md streamMsgDelete
  6875  	err := json.Unmarshal(buf, &md)
  6876  	return &md, err
  6877  }
  6878  
  6879  func (s *Server) jsClusteredMsgDeleteRequest(ci *ClientInfo, acc *Account, mset *stream, stream, subject, reply string, req *JSApiMsgDeleteRequest, rmsg []byte) {
  6880  	js, cc := s.getJetStreamCluster()
  6881  	if js == nil || cc == nil {
  6882  		return
  6883  	}
  6884  
  6885  	js.mu.Lock()
  6886  	sa := js.streamAssignment(acc.Name, stream)
  6887  	if sa == nil {
  6888  		s.Debugf("Message delete failed, could not locate stream '%s > %s'", acc.Name, stream)
  6889  		js.mu.Unlock()
  6890  		return
  6891  	}
  6892  
  6893  	// Check for single replica items.
  6894  	if n := sa.Group.node; n != nil {
  6895  		md := streamMsgDelete{Seq: req.Seq, NoErase: req.NoErase, Stream: stream, Subject: subject, Reply: reply, Client: ci}
  6896  		n.Propose(encodeMsgDelete(&md))
  6897  		js.mu.Unlock()
  6898  		return
  6899  	}
  6900  	js.mu.Unlock()
  6901  
  6902  	if mset == nil {
  6903  		return
  6904  	}
  6905  
  6906  	var err error
  6907  	var removed bool
  6908  	if req.NoErase {
  6909  		removed, err = mset.removeMsg(req.Seq)
  6910  	} else {
  6911  		removed, err = mset.eraseMsg(req.Seq)
  6912  	}
  6913  	var resp = JSApiMsgDeleteResponse{ApiResponse: ApiResponse{Type: JSApiMsgDeleteResponseType}}
  6914  	if err != nil {
  6915  		resp.Error = NewJSStreamMsgDeleteFailedError(err, Unless(err))
  6916  	} else if !removed {
  6917  		resp.Error = NewJSSequenceNotFoundError(req.Seq)
  6918  	} else {
  6919  		resp.Success = true
  6920  	}
  6921  	s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6922  }
  6923  
  6924  func encodeAddStreamAssignment(sa *streamAssignment) []byte {
  6925  	var bb bytes.Buffer
  6926  	bb.WriteByte(byte(assignStreamOp))
  6927  	json.NewEncoder(&bb).Encode(sa)
  6928  	return bb.Bytes()
  6929  }
  6930  
  6931  func encodeUpdateStreamAssignment(sa *streamAssignment) []byte {
  6932  	var bb bytes.Buffer
  6933  	bb.WriteByte(byte(updateStreamOp))
  6934  	json.NewEncoder(&bb).Encode(sa)
  6935  	return bb.Bytes()
  6936  }
  6937  
  6938  func encodeDeleteStreamAssignment(sa *streamAssignment) []byte {
  6939  	var bb bytes.Buffer
  6940  	bb.WriteByte(byte(removeStreamOp))
  6941  	json.NewEncoder(&bb).Encode(sa)
  6942  	return bb.Bytes()
  6943  }
  6944  
  6945  func decodeStreamAssignment(buf []byte) (*streamAssignment, error) {
  6946  	var sa streamAssignment
  6947  	err := json.Unmarshal(buf, &sa)
  6948  	if err != nil {
  6949  		return nil, err
  6950  	}
  6951  	fixCfgMirrorWithDedupWindow(sa.Config)
  6952  	return &sa, err
  6953  }
  6954  
  6955  func encodeDeleteRange(dr *DeleteRange) []byte {
  6956  	var bb bytes.Buffer
  6957  	bb.WriteByte(byte(deleteRangeOp))
  6958  	json.NewEncoder(&bb).Encode(dr)
  6959  	return bb.Bytes()
  6960  }
  6961  
  6962  func decodeDeleteRange(buf []byte) (*DeleteRange, error) {
  6963  	var dr DeleteRange
  6964  	err := json.Unmarshal(buf, &dr)
  6965  	if err != nil {
  6966  		return nil, err
  6967  	}
  6968  	return &dr, err
  6969  }
  6970  
  6971  // createGroupForConsumer will create a new group from same peer set as the stream.
  6972  func (cc *jetStreamCluster) createGroupForConsumer(cfg *ConsumerConfig, sa *streamAssignment) *raftGroup {
  6973  	if len(sa.Group.Peers) == 0 || cfg.Replicas > len(sa.Group.Peers) {
  6974  		return nil
  6975  	}
  6976  
  6977  	peers := copyStrings(sa.Group.Peers)
  6978  	var _ss [5]string
  6979  	active := _ss[:0]
  6980  
  6981  	// Calculate all active peers.
  6982  	for _, peer := range peers {
  6983  		if sir, ok := cc.s.nodeToInfo.Load(peer); ok && sir != nil {
  6984  			if !sir.(nodeInfo).offline {
  6985  				active = append(active, peer)
  6986  			}
  6987  		}
  6988  	}
  6989  	if quorum := cfg.Replicas/2 + 1; quorum > len(active) {
  6990  		// Not enough active to satisfy the request.
  6991  		return nil
  6992  	}
  6993  
  6994  	// If we want less then our parent stream, select from active.
  6995  	if cfg.Replicas > 0 && cfg.Replicas < len(peers) {
  6996  		// Pedantic in case stream is say R5 and consumer is R3 and 3 or more offline, etc.
  6997  		if len(active) < cfg.Replicas {
  6998  			return nil
  6999  		}
  7000  		// First shuffle the active peers and then select to account for replica = 1.
  7001  		rand.Shuffle(len(active), func(i, j int) { active[i], active[j] = active[j], active[i] })
  7002  		peers = active[:cfg.Replicas]
  7003  	}
  7004  	storage := sa.Config.Storage
  7005  	if cfg.MemoryStorage {
  7006  		storage = MemoryStorage
  7007  	}
  7008  	return &raftGroup{Name: groupNameForConsumer(peers, storage), Storage: storage, Peers: peers}
  7009  }
  7010  
  7011  // jsClusteredConsumerRequest is first point of entry to create a consumer with R > 1.
  7012  func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, stream string, cfg *ConsumerConfig, action ConsumerAction) {
  7013  	js, cc := s.getJetStreamCluster()
  7014  	if js == nil || cc == nil {
  7015  		return
  7016  	}
  7017  
  7018  	var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}
  7019  
  7020  	streamCfg, ok := js.clusterStreamConfig(acc.Name, stream)
  7021  	if !ok {
  7022  		resp.Error = NewJSStreamNotFoundError()
  7023  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7024  		return
  7025  	}
  7026  	selectedLimits, _, _, apiErr := acc.selectLimits(&streamCfg)
  7027  	if apiErr != nil {
  7028  		resp.Error = apiErr
  7029  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7030  		return
  7031  	}
  7032  	srvLim := &s.getOpts().JetStreamLimits
  7033  	// Make sure we have sane defaults
  7034  	setConsumerConfigDefaults(cfg, &streamCfg, srvLim, selectedLimits)
  7035  
  7036  	if err := checkConsumerCfg(cfg, srvLim, &streamCfg, acc, selectedLimits, false); err != nil {
  7037  		resp.Error = err
  7038  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7039  		return
  7040  	}
  7041  
  7042  	js.mu.Lock()
  7043  	defer js.mu.Unlock()
  7044  
  7045  	if cc.meta == nil {
  7046  		return
  7047  	}
  7048  
  7049  	// Lookup the stream assignment.
  7050  	sa := js.streamAssignment(acc.Name, stream)
  7051  	if sa == nil {
  7052  		resp.Error = NewJSStreamNotFoundError()
  7053  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7054  		return
  7055  	}
  7056  
  7057  	// Check for max consumers here to short circuit if possible.
  7058  	// Start with limit on a stream, but if one is defined at the level of the account
  7059  	// and is lower, use that limit.
  7060  	maxc := sa.Config.MaxConsumers
  7061  	if maxc <= 0 || (selectedLimits.MaxConsumers > 0 && selectedLimits.MaxConsumers < maxc) {
  7062  		maxc = selectedLimits.MaxConsumers
  7063  	}
  7064  	if maxc > 0 {
  7065  		// Don't count DIRECTS.
  7066  		total := 0
  7067  		for _, ca := range sa.consumers {
  7068  			if ca.Config != nil && !ca.Config.Direct {
  7069  				total++
  7070  			}
  7071  		}
  7072  		if total >= maxc {
  7073  			resp.Error = NewJSMaximumConsumersLimitError()
  7074  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7075  			return
  7076  		}
  7077  	}
  7078  
  7079  	// Also short circuit if DeliverLastPerSubject is set with no FilterSubject.
  7080  	if cfg.DeliverPolicy == DeliverLastPerSubject {
  7081  		if cfg.FilterSubject == _EMPTY_ && len(cfg.FilterSubjects) == 0 {
  7082  			resp.Error = NewJSConsumerInvalidPolicyError(fmt.Errorf("consumer delivery policy is deliver last per subject, but FilterSubject is not set"))
  7083  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7084  			return
  7085  		}
  7086  	}
  7087  
  7088  	// Setup proper default for ack wait if we are in explicit ack mode.
  7089  	if cfg.AckWait == 0 && (cfg.AckPolicy == AckExplicit || cfg.AckPolicy == AckAll) {
  7090  		cfg.AckWait = JsAckWaitDefault
  7091  	}
  7092  	// Setup default of -1, meaning no limit for MaxDeliver.
  7093  	if cfg.MaxDeliver == 0 {
  7094  		cfg.MaxDeliver = -1
  7095  	}
  7096  	// Set proper default for max ack pending if we are ack explicit and none has been set.
  7097  	if cfg.AckPolicy == AckExplicit && cfg.MaxAckPending == 0 {
  7098  		cfg.MaxAckPending = JsDefaultMaxAckPending
  7099  	}
  7100  
  7101  	var ca *consumerAssignment
  7102  	var oname string
  7103  
  7104  	// See if we have an existing one already under same durable name or
  7105  	// if name was set by the user.
  7106  	if isDurableConsumer(cfg) || cfg.Name != _EMPTY_ {
  7107  		if cfg.Name != _EMPTY_ {
  7108  			oname = cfg.Name
  7109  		} else {
  7110  			oname = cfg.Durable
  7111  		}
  7112  		if ca = sa.consumers[oname]; ca != nil && !ca.deleted {
  7113  			if action == ActionCreate && !reflect.DeepEqual(cfg, ca.Config) {
  7114  				resp.Error = NewJSConsumerAlreadyExistsError()
  7115  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7116  				return
  7117  			}
  7118  			// Do quick sanity check on new cfg to prevent here if possible.
  7119  			if err := acc.checkNewConsumerConfig(ca.Config, cfg); err != nil {
  7120  				resp.Error = NewJSConsumerCreateError(err, Unless(err))
  7121  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7122  				return
  7123  			}
  7124  		}
  7125  	}
  7126  
  7127  	// If this is new consumer.
  7128  	if ca == nil {
  7129  		if action == ActionUpdate {
  7130  			resp.Error = NewJSConsumerDoesNotExistError()
  7131  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7132  			return
  7133  		}
  7134  		rg := cc.createGroupForConsumer(cfg, sa)
  7135  		if rg == nil {
  7136  			resp.Error = NewJSInsufficientResourcesError()
  7137  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7138  			return
  7139  		}
  7140  		// Pick a preferred leader.
  7141  		rg.setPreferred()
  7142  
  7143  		// Inherit cluster from stream.
  7144  		rg.Cluster = sa.Group.Cluster
  7145  
  7146  		// We need to set the ephemeral here before replicating.
  7147  		if !isDurableConsumer(cfg) {
  7148  			// We chose to have ephemerals be R=1 unless stream is interest or workqueue.
  7149  			// Consumer can override.
  7150  			if sa.Config.Retention == LimitsPolicy && cfg.Replicas <= 1 {
  7151  				rg.Peers = []string{rg.Preferred}
  7152  				rg.Name = groupNameForConsumer(rg.Peers, rg.Storage)
  7153  			}
  7154  			if cfg.Name != _EMPTY_ {
  7155  				oname = cfg.Name
  7156  			} else {
  7157  				// Make sure name is unique.
  7158  				for {
  7159  					oname = createConsumerName()
  7160  					if sa.consumers != nil {
  7161  						if sa.consumers[oname] != nil {
  7162  							continue
  7163  						}
  7164  					}
  7165  					break
  7166  				}
  7167  			}
  7168  		}
  7169  		if len(rg.Peers) > 1 {
  7170  			if maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets; maxHaAssets != 0 {
  7171  				for _, peer := range rg.Peers {
  7172  					if ni, ok := s.nodeToInfo.Load(peer); ok {
  7173  						ni := ni.(nodeInfo)
  7174  						if stats := ni.stats; stats != nil && stats.HAAssets > maxHaAssets {
  7175  							resp.Error = NewJSInsufficientResourcesError()
  7176  							s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7177  							s.Warnf("%s@%s (HA Asset Count: %d) exceeds max ha asset limit of %d"+
  7178  								" for (durable) consumer %s placement on stream %s",
  7179  								ni.name, ni.cluster, ni.stats.HAAssets, maxHaAssets, oname, stream)
  7180  							return
  7181  						}
  7182  					}
  7183  				}
  7184  			}
  7185  		}
  7186  		ca = &consumerAssignment{
  7187  			Group:   rg,
  7188  			Stream:  stream,
  7189  			Name:    oname,
  7190  			Config:  cfg,
  7191  			Subject: subject,
  7192  			Reply:   reply,
  7193  			Client:  ci,
  7194  			Created: time.Now().UTC(),
  7195  		}
  7196  	} else {
  7197  		// If the consumer already exists then don't allow updating the PauseUntil, just set
  7198  		// it back to whatever the current configured value is.
  7199  		cfg.PauseUntil = ca.Config.PauseUntil
  7200  
  7201  		nca := ca.copyGroup()
  7202  
  7203  		rBefore := nca.Config.replicas(sa.Config)
  7204  		rAfter := cfg.replicas(sa.Config)
  7205  
  7206  		var curLeader string
  7207  		if rBefore != rAfter {
  7208  			// We are modifying nodes here. We want to do our best to preserve the current leader.
  7209  			// We have support now from above that guarantees we are in our own Go routine, so can
  7210  			// ask for stream info from the stream leader to make sure we keep the leader in the new list.
  7211  			if !s.allPeersOffline(ca.Group) {
  7212  				// Need to release js lock.
  7213  				js.mu.Unlock()
  7214  				if ci, err := sysRequest[ConsumerInfo](s, clusterConsumerInfoT, ci.serviceAccount(), sa.Config.Name, cfg.Durable); err != nil {
  7215  					s.Warnf("Did not receive consumer info results for '%s > %s > %s' due to: %s", acc, sa.Config.Name, cfg.Durable, err)
  7216  				} else if ci != nil {
  7217  					if cl := ci.Cluster; cl != nil {
  7218  						curLeader = getHash(cl.Leader)
  7219  					}
  7220  				}
  7221  				// Re-acquire here.
  7222  				js.mu.Lock()
  7223  			}
  7224  		}
  7225  
  7226  		if rBefore < rAfter {
  7227  			newPeerSet := nca.Group.Peers
  7228  			// scale up by adding new members from the stream peer set that are not yet in the consumer peer set
  7229  			streamPeerSet := copyStrings(sa.Group.Peers)
  7230  			rand.Shuffle(rAfter, func(i, j int) { streamPeerSet[i], streamPeerSet[j] = streamPeerSet[j], streamPeerSet[i] })
  7231  			for _, p := range streamPeerSet {
  7232  				found := false
  7233  				for _, sp := range newPeerSet {
  7234  					if sp == p {
  7235  						found = true
  7236  						break
  7237  					}
  7238  				}
  7239  				if !found {
  7240  					newPeerSet = append(newPeerSet, p)
  7241  					if len(newPeerSet) == rAfter {
  7242  						break
  7243  					}
  7244  				}
  7245  			}
  7246  			nca.Group.Peers = newPeerSet
  7247  			nca.Group.Preferred = curLeader
  7248  		} else if rBefore > rAfter {
  7249  			newPeerSet := nca.Group.Peers
  7250  			// mark leader preferred and move it to end
  7251  			nca.Group.Preferred = curLeader
  7252  			if nca.Group.Preferred != _EMPTY_ {
  7253  				for i, p := range newPeerSet {
  7254  					if nca.Group.Preferred == p {
  7255  						newPeerSet[i] = newPeerSet[len(newPeerSet)-1]
  7256  						newPeerSet[len(newPeerSet)-1] = p
  7257  					}
  7258  				}
  7259  			}
  7260  			// scale down by removing peers from the end
  7261  			newPeerSet = newPeerSet[len(newPeerSet)-rAfter:]
  7262  			nca.Group.Peers = newPeerSet
  7263  		}
  7264  
  7265  		// Update config and client info on copy of existing.
  7266  		nca.Config = cfg
  7267  		nca.Client = ci
  7268  		nca.Subject = subject
  7269  		nca.Reply = reply
  7270  		ca = nca
  7271  	}
  7272  
  7273  	eca := encodeAddConsumerAssignment(ca)
  7274  
  7275  	// Mark this as pending.
  7276  	if sa.consumers == nil {
  7277  		sa.consumers = make(map[string]*consumerAssignment)
  7278  	}
  7279  	sa.consumers[ca.Name] = ca
  7280  
  7281  	// Do formal proposal.
  7282  	cc.meta.Propose(eca)
  7283  }
  7284  
  7285  func encodeAddConsumerAssignment(ca *consumerAssignment) []byte {
  7286  	var bb bytes.Buffer
  7287  	bb.WriteByte(byte(assignConsumerOp))
  7288  	json.NewEncoder(&bb).Encode(ca)
  7289  	return bb.Bytes()
  7290  }
  7291  
  7292  func encodeDeleteConsumerAssignment(ca *consumerAssignment) []byte {
  7293  	var bb bytes.Buffer
  7294  	bb.WriteByte(byte(removeConsumerOp))
  7295  	json.NewEncoder(&bb).Encode(ca)
  7296  	return bb.Bytes()
  7297  }
  7298  
  7299  func decodeConsumerAssignment(buf []byte) (*consumerAssignment, error) {
  7300  	var ca consumerAssignment
  7301  	err := json.Unmarshal(buf, &ca)
  7302  	return &ca, err
  7303  }
  7304  
  7305  func encodeAddConsumerAssignmentCompressed(ca *consumerAssignment) []byte {
  7306  	b, err := json.Marshal(ca)
  7307  	if err != nil {
  7308  		return nil
  7309  	}
  7310  	// TODO(dlc) - Streaming better approach here probably.
  7311  	var bb bytes.Buffer
  7312  	bb.WriteByte(byte(assignCompressedConsumerOp))
  7313  	bb.Write(s2.Encode(nil, b))
  7314  	return bb.Bytes()
  7315  }
  7316  
  7317  func decodeConsumerAssignmentCompressed(buf []byte) (*consumerAssignment, error) {
  7318  	var ca consumerAssignment
  7319  	js, err := s2.Decode(nil, buf)
  7320  	if err != nil {
  7321  		return nil, err
  7322  	}
  7323  	err = json.Unmarshal(js, &ca)
  7324  	return &ca, err
  7325  }
  7326  
  7327  var errBadStreamMsg = errors.New("jetstream cluster bad replicated stream msg")
  7328  
  7329  func decodeStreamMsg(buf []byte) (subject, reply string, hdr, msg []byte, lseq uint64, ts int64, err error) {
  7330  	var le = binary.LittleEndian
  7331  	if len(buf) < 26 {
  7332  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7333  	}
  7334  	lseq = le.Uint64(buf)
  7335  	buf = buf[8:]
  7336  	ts = int64(le.Uint64(buf))
  7337  	buf = buf[8:]
  7338  	sl := int(le.Uint16(buf))
  7339  	buf = buf[2:]
  7340  	if len(buf) < sl {
  7341  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7342  	}
  7343  	subject = string(buf[:sl])
  7344  	buf = buf[sl:]
  7345  	if len(buf) < 2 {
  7346  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7347  	}
  7348  	rl := int(le.Uint16(buf))
  7349  	buf = buf[2:]
  7350  	if len(buf) < rl {
  7351  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7352  	}
  7353  	reply = string(buf[:rl])
  7354  	buf = buf[rl:]
  7355  	if len(buf) < 2 {
  7356  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7357  	}
  7358  	hl := int(le.Uint16(buf))
  7359  	buf = buf[2:]
  7360  	if len(buf) < hl {
  7361  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7362  	}
  7363  	if hdr = buf[:hl]; len(hdr) == 0 {
  7364  		hdr = nil
  7365  	}
  7366  	buf = buf[hl:]
  7367  	if len(buf) < 4 {
  7368  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7369  	}
  7370  	ml := int(le.Uint32(buf))
  7371  	buf = buf[4:]
  7372  	if len(buf) < ml {
  7373  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7374  	}
  7375  	if msg = buf[:ml]; len(msg) == 0 {
  7376  		msg = nil
  7377  	}
  7378  	return subject, reply, hdr, msg, lseq, ts, nil
  7379  }
  7380  
  7381  // Helper to return if compression allowed.
  7382  func (mset *stream) compressAllowed() bool {
  7383  	mset.clMu.Lock()
  7384  	defer mset.clMu.Unlock()
  7385  	return mset.compressOK
  7386  }
  7387  
  7388  func encodeStreamMsg(subject, reply string, hdr, msg []byte, lseq uint64, ts int64) []byte {
  7389  	return encodeStreamMsgAllowCompress(subject, reply, hdr, msg, lseq, ts, false)
  7390  }
  7391  
  7392  // Threshold for compression.
  7393  // TODO(dlc) - Eventually make configurable.
  7394  const compressThreshold = 256
  7395  
  7396  // If allowed and contents over the threshold we will compress.
  7397  func encodeStreamMsgAllowCompress(subject, reply string, hdr, msg []byte, lseq uint64, ts int64, compressOK bool) []byte {
  7398  	shouldCompress := compressOK && len(subject)+len(reply)+len(hdr)+len(msg) > compressThreshold
  7399  
  7400  	elen := 1 + 8 + 8 + len(subject) + len(reply) + len(hdr) + len(msg)
  7401  	elen += (2 + 2 + 2 + 4) // Encoded lengths, 4bytes
  7402  	// TODO(dlc) - check sizes of subject, reply and hdr, make sure uint16 ok.
  7403  	buf := make([]byte, elen)
  7404  	buf[0] = byte(streamMsgOp)
  7405  	var le = binary.LittleEndian
  7406  	wi := 1
  7407  	le.PutUint64(buf[wi:], lseq)
  7408  	wi += 8
  7409  	le.PutUint64(buf[wi:], uint64(ts))
  7410  	wi += 8
  7411  	le.PutUint16(buf[wi:], uint16(len(subject)))
  7412  	wi += 2
  7413  	copy(buf[wi:], subject)
  7414  	wi += len(subject)
  7415  	le.PutUint16(buf[wi:], uint16(len(reply)))
  7416  	wi += 2
  7417  	copy(buf[wi:], reply)
  7418  	wi += len(reply)
  7419  	le.PutUint16(buf[wi:], uint16(len(hdr)))
  7420  	wi += 2
  7421  	if len(hdr) > 0 {
  7422  		copy(buf[wi:], hdr)
  7423  		wi += len(hdr)
  7424  	}
  7425  	le.PutUint32(buf[wi:], uint32(len(msg)))
  7426  	wi += 4
  7427  	if len(msg) > 0 {
  7428  		copy(buf[wi:], msg)
  7429  		wi += len(msg)
  7430  	}
  7431  
  7432  	// Check if we should compress.
  7433  	if shouldCompress {
  7434  		nbuf := make([]byte, s2.MaxEncodedLen(elen))
  7435  		nbuf[0] = byte(compressedStreamMsgOp)
  7436  		ebuf := s2.Encode(nbuf[1:], buf[1:wi])
  7437  		// Only pay cost of decode the other side if we compressed.
  7438  		// S2 will allow us to try without major penalty for non-compressable data.
  7439  		if len(ebuf) < wi {
  7440  			nbuf = nbuf[:len(ebuf)+1]
  7441  			buf, wi = nbuf, len(nbuf)
  7442  		}
  7443  	}
  7444  
  7445  	return buf[:wi]
  7446  }
  7447  
  7448  // Determine if all peers in our set support the binary snapshot.
  7449  func (mset *stream) supportsBinarySnapshot() bool {
  7450  	mset.mu.RLock()
  7451  	defer mset.mu.RUnlock()
  7452  	return mset.supportsBinarySnapshotLocked()
  7453  }
  7454  
  7455  // Determine if all peers in our set support the binary snapshot.
  7456  // Lock should be held.
  7457  func (mset *stream) supportsBinarySnapshotLocked() bool {
  7458  	s, n := mset.srv, mset.node
  7459  	if s == nil || n == nil {
  7460  		return false
  7461  	}
  7462  	// Grab our peers and walk them to make sure we can all support binary stream snapshots.
  7463  	id, peers := n.ID(), n.Peers()
  7464  	for _, p := range peers {
  7465  		if p.ID == id {
  7466  			// We know we support ourselves.
  7467  			continue
  7468  		}
  7469  		if sir, ok := s.nodeToInfo.Load(p.ID); !ok || sir == nil || !sir.(nodeInfo).binarySnapshots {
  7470  			return false
  7471  		}
  7472  	}
  7473  	return true
  7474  }
  7475  
  7476  // StreamSnapshot is used for snapshotting and out of band catch up in clustered mode.
  7477  // Legacy, replace with binary stream snapshots.
  7478  type streamSnapshot struct {
  7479  	Msgs     uint64   `json:"messages"`
  7480  	Bytes    uint64   `json:"bytes"`
  7481  	FirstSeq uint64   `json:"first_seq"`
  7482  	LastSeq  uint64   `json:"last_seq"`
  7483  	Failed   uint64   `json:"clfs"`
  7484  	Deleted  []uint64 `json:"deleted,omitempty"`
  7485  }
  7486  
  7487  // Grab a snapshot of a stream for clustered mode.
  7488  func (mset *stream) stateSnapshot() []byte {
  7489  	mset.mu.RLock()
  7490  	defer mset.mu.RUnlock()
  7491  	return mset.stateSnapshotLocked()
  7492  }
  7493  
  7494  // Grab a snapshot of a stream for clustered mode.
  7495  // Lock should be held.
  7496  func (mset *stream) stateSnapshotLocked() []byte {
  7497  	// Decide if we can support the new style of stream snapshots.
  7498  	if mset.supportsBinarySnapshotLocked() {
  7499  		snap, _ := mset.store.EncodedStreamState(mset.getCLFS())
  7500  		return snap
  7501  	}
  7502  
  7503  	// Older v1 version with deleted as a sorted []uint64.
  7504  	state := mset.store.State()
  7505  	snap := &streamSnapshot{
  7506  		Msgs:     state.Msgs,
  7507  		Bytes:    state.Bytes,
  7508  		FirstSeq: state.FirstSeq,
  7509  		LastSeq:  state.LastSeq,
  7510  		Failed:   mset.getCLFS(),
  7511  		Deleted:  state.Deleted,
  7512  	}
  7513  	b, _ := json.Marshal(snap)
  7514  	return b
  7515  }
  7516  
  7517  // Will check if we can do message compression in RAFT and catchup logic.
  7518  func (mset *stream) checkAllowMsgCompress(peers []string) {
  7519  	allowed := true
  7520  	for _, id := range peers {
  7521  		sir, ok := mset.srv.nodeToInfo.Load(id)
  7522  		if !ok || sir == nil {
  7523  			allowed = false
  7524  			break
  7525  		}
  7526  		// Check for capability.
  7527  		if si := sir.(nodeInfo); si.cfg == nil || !si.cfg.CompressOK {
  7528  			allowed = false
  7529  			break
  7530  		}
  7531  	}
  7532  	mset.mu.Lock()
  7533  	mset.compressOK = allowed
  7534  	mset.mu.Unlock()
  7535  }
  7536  
  7537  // To warn when we are getting too far behind from what has been proposed vs what has been committed.
  7538  const streamLagWarnThreshold = 10_000
  7539  
  7540  // processClusteredMsg will propose the inbound message to the underlying raft group.
  7541  func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg []byte, mt *msgTrace) (retErr error) {
  7542  	// For possible error response.
  7543  	var response []byte
  7544  
  7545  	mset.mu.RLock()
  7546  	canRespond := !mset.cfg.NoAck && len(reply) > 0
  7547  	name, stype, store := mset.cfg.Name, mset.cfg.Storage, mset.store
  7548  	s, js, jsa, st, r, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.tier, mset.outq, mset.node
  7549  	maxMsgSize, lseq, clfs := int(mset.cfg.MaxMsgSize), mset.lseq, mset.clfs
  7550  	interestPolicy, discard, maxMsgs, maxBytes := mset.cfg.Retention != LimitsPolicy, mset.cfg.Discard, mset.cfg.MaxMsgs, mset.cfg.MaxBytes
  7551  	isLeader, isSealed := mset.isLeader(), mset.cfg.Sealed
  7552  
  7553  	// We need to track state to check limits if interest retention and discard new with max msgs or bytes.
  7554  	var state StreamState
  7555  	if interestPolicy && discard == DiscardNew && (maxMsgs > 0 || maxBytes > 0) {
  7556  		mset.store.FastState(&state)
  7557  	}
  7558  	mset.mu.RUnlock()
  7559  
  7560  	// This should not happen but possible now that we allow scale up, and scale down where this could trigger.
  7561  	//
  7562  	// We also invoke this in clustering mode for message tracing when not
  7563  	// performing message delivery.
  7564  	if node == nil || mt.traceOnly() {
  7565  		return mset.processJetStreamMsg(subject, reply, hdr, msg, 0, 0, mt)
  7566  	}
  7567  
  7568  	// If message tracing (with message delivery), we will need to send the
  7569  	// event on exit in case there was an error (if message was not proposed).
  7570  	// Otherwise, the event will be sent from processJetStreamMsg when
  7571  	// invoked by the leader (from applyStreamEntries).
  7572  	if mt != nil {
  7573  		defer func() {
  7574  			if retErr != nil {
  7575  				mt.sendEventFromJetStream(retErr)
  7576  			}
  7577  		}()
  7578  	}
  7579  
  7580  	// Check that we are the leader. This can be false if we have scaled up from an R1 that had inbound queued messages.
  7581  	if !isLeader {
  7582  		return NewJSClusterNotLeaderError()
  7583  	}
  7584  
  7585  	// Bail here if sealed.
  7586  	if isSealed {
  7587  		var resp = JSPubAckResponse{PubAck: &PubAck{Stream: mset.name()}, Error: NewJSStreamSealedError()}
  7588  		b, _ := json.Marshal(resp)
  7589  		mset.outq.sendMsg(reply, b)
  7590  		return NewJSStreamSealedError()
  7591  	}
  7592  
  7593  	// Check here pre-emptively if we have exceeded this server limits.
  7594  	if js.limitsExceeded(stype) {
  7595  		s.resourcesExceededError()
  7596  		if canRespond {
  7597  			b, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: NewJSInsufficientResourcesError()})
  7598  			outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, b, nil, 0))
  7599  		}
  7600  		// Stepdown regardless.
  7601  		if node := mset.raftNode(); node != nil {
  7602  			node.StepDown()
  7603  		}
  7604  		return NewJSInsufficientResourcesError()
  7605  	}
  7606  
  7607  	// Check here pre-emptively if we have exceeded our account limits.
  7608  	if exceeded, err := jsa.wouldExceedLimits(st, tierName, r, subject, hdr, msg); exceeded {
  7609  		if err == nil {
  7610  			err = NewJSAccountResourcesExceededError()
  7611  		}
  7612  		s.RateLimitWarnf(err.Error())
  7613  		if canRespond {
  7614  			var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7615  			resp.Error = err
  7616  			response, _ = json.Marshal(resp)
  7617  			outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0))
  7618  		}
  7619  		return err
  7620  	}
  7621  
  7622  	// Check msgSize if we have a limit set there. Again this works if it goes through but better to be pre-emptive.
  7623  	if maxMsgSize >= 0 && (len(hdr)+len(msg)) > maxMsgSize {
  7624  		err := fmt.Errorf("JetStream message size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name)
  7625  		s.RateLimitWarnf(err.Error())
  7626  		if canRespond {
  7627  			var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7628  			resp.Error = NewJSStreamMessageExceedsMaximumError()
  7629  			response, _ = json.Marshal(resp)
  7630  			outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0))
  7631  		}
  7632  		return err
  7633  	}
  7634  
  7635  	// Some header checks can be checked pre proposal. Most can not.
  7636  	if len(hdr) > 0 {
  7637  		// Expected last sequence per subject.
  7638  		// We can check for last sequence per subject but only if the expected seq <= lseq.
  7639  		if seq, exists := getExpectedLastSeqPerSubject(hdr); exists && store != nil && seq > 0 && seq <= lseq {
  7640  			var smv StoreMsg
  7641  			var fseq uint64
  7642  			sm, err := store.LoadLastMsg(subject, &smv)
  7643  			if sm != nil {
  7644  				fseq = sm.seq
  7645  			}
  7646  			if err != nil || fseq != seq {
  7647  				if canRespond {
  7648  					var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7649  					resp.PubAck = &PubAck{Stream: name}
  7650  					resp.Error = NewJSStreamWrongLastSequenceError(fseq)
  7651  					b, _ := json.Marshal(resp)
  7652  					outq.sendMsg(reply, b)
  7653  				}
  7654  				return fmt.Errorf("last sequence by subject mismatch: %d vs %d", seq, fseq)
  7655  			}
  7656  		}
  7657  		// Expected stream name can also be pre-checked.
  7658  		if sname := getExpectedStream(hdr); sname != _EMPTY_ && sname != name {
  7659  			if canRespond {
  7660  				var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7661  				resp.PubAck = &PubAck{Stream: name}
  7662  				resp.Error = NewJSStreamNotMatchError()
  7663  				b, _ := json.Marshal(resp)
  7664  				outq.sendMsg(reply, b)
  7665  			}
  7666  			return errors.New("expected stream does not match")
  7667  		}
  7668  	}
  7669  
  7670  	// Since we encode header len as u16 make sure we do not exceed.
  7671  	// Again this works if it goes through but better to be pre-emptive.
  7672  	if len(hdr) > math.MaxUint16 {
  7673  		err := fmt.Errorf("JetStream header size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name)
  7674  		s.RateLimitWarnf(err.Error())
  7675  		if canRespond {
  7676  			var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7677  			resp.Error = NewJSStreamHeaderExceedsMaximumError()
  7678  			response, _ = json.Marshal(resp)
  7679  			outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0))
  7680  		}
  7681  		return err
  7682  	}
  7683  
  7684  	// Proceed with proposing this message.
  7685  
  7686  	// We only use mset.clseq for clustering and in case we run ahead of actual commits.
  7687  	// Check if we need to set initial value here
  7688  	mset.clMu.Lock()
  7689  	if mset.clseq == 0 || mset.clseq < lseq {
  7690  		// Re-capture
  7691  		lseq, clfs = mset.lseq, mset.clfs
  7692  		mset.clseq = lseq + clfs
  7693  	}
  7694  
  7695  	// Check if we have an interest policy and discard new with max msgs or bytes.
  7696  	// We need to deny here otherwise it could succeed on some peers and not others
  7697  	// depending on consumer ack state. So we deny here, if we allow that means we know
  7698  	// it would succeed on every peer.
  7699  	if interestPolicy && discard == DiscardNew && (maxMsgs > 0 || maxBytes > 0) {
  7700  		// Track inflight.
  7701  		if mset.inflight == nil {
  7702  			mset.inflight = make(map[uint64]uint64)
  7703  		}
  7704  		if mset.cfg.Storage == FileStorage {
  7705  			mset.inflight[mset.clseq] = fileStoreMsgSize(subject, hdr, msg)
  7706  		} else {
  7707  			mset.inflight[mset.clseq] = memStoreMsgSize(subject, hdr, msg)
  7708  		}
  7709  
  7710  		var err error
  7711  		if maxMsgs > 0 && state.Msgs+uint64(len(mset.inflight)) > uint64(maxMsgs) {
  7712  			err = ErrMaxMsgs
  7713  		} else if maxBytes > 0 {
  7714  			// TODO(dlc) - Could track this rollup independently.
  7715  			var bytesPending uint64
  7716  			for _, nb := range mset.inflight {
  7717  				bytesPending += nb
  7718  			}
  7719  			if state.Bytes+bytesPending > uint64(maxBytes) {
  7720  				err = ErrMaxBytes
  7721  			}
  7722  		}
  7723  		if err != nil {
  7724  			delete(mset.inflight, mset.clseq)
  7725  			mset.clMu.Unlock()
  7726  			if canRespond {
  7727  				var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7728  				resp.Error = NewJSStreamStoreFailedError(err, Unless(err))
  7729  				response, _ = json.Marshal(resp)
  7730  				outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0))
  7731  			}
  7732  			return err
  7733  		}
  7734  	}
  7735  
  7736  	esm := encodeStreamMsgAllowCompress(subject, reply, hdr, msg, mset.clseq, time.Now().UnixNano(), mset.compressOK)
  7737  	var mtKey uint64
  7738  	if mt != nil {
  7739  		mtKey = mset.clseq
  7740  		if mset.mt == nil {
  7741  			mset.mt = make(map[uint64]*msgTrace)
  7742  		}
  7743  		mset.mt[mtKey] = mt
  7744  	}
  7745  	mset.clseq++
  7746  
  7747  	// Do proposal.
  7748  	err := node.Propose(esm)
  7749  	if err != nil && mset.clseq > 0 {
  7750  		mset.clseq--
  7751  	}
  7752  
  7753  	// Check to see if we are being overrun.
  7754  	// TODO(dlc) - Make this a limit where we drop messages to protect ourselves, but allow to be configured.
  7755  	if mset.clseq-(lseq+clfs) > streamLagWarnThreshold {
  7756  		lerr := fmt.Errorf("JetStream stream '%s > %s' has high message lag", jsa.acc().Name, name)
  7757  		s.RateLimitWarnf(lerr.Error())
  7758  	}
  7759  	mset.clMu.Unlock()
  7760  
  7761  	if err != nil {
  7762  		if mt != nil {
  7763  			mset.getAndDeleteMsgTrace(mtKey)
  7764  		}
  7765  		if canRespond {
  7766  			var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: mset.cfg.Name}}
  7767  			resp.Error = &ApiError{Code: 503, Description: err.Error()}
  7768  			response, _ = json.Marshal(resp)
  7769  			// If we errored out respond here.
  7770  			outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0))
  7771  		}
  7772  	}
  7773  
  7774  	if err != nil && isOutOfSpaceErr(err) {
  7775  		s.handleOutOfSpace(mset)
  7776  	}
  7777  
  7778  	return err
  7779  }
  7780  
  7781  func (mset *stream) getAndDeleteMsgTrace(lseq uint64) *msgTrace {
  7782  	if mset == nil {
  7783  		return nil
  7784  	}
  7785  	mset.clMu.Lock()
  7786  	mt, ok := mset.mt[lseq]
  7787  	if ok {
  7788  		delete(mset.mt, lseq)
  7789  	}
  7790  	mset.clMu.Unlock()
  7791  	return mt
  7792  }
  7793  
  7794  // For requesting messages post raft snapshot to catch up streams post server restart.
  7795  // Any deleted msgs etc will be handled inline on catchup.
  7796  type streamSyncRequest struct {
  7797  	Peer           string `json:"peer,omitempty"`
  7798  	FirstSeq       uint64 `json:"first_seq"`
  7799  	LastSeq        uint64 `json:"last_seq"`
  7800  	DeleteRangesOk bool   `json:"delete_ranges"`
  7801  }
  7802  
  7803  // Given a stream state that represents a snapshot, calculate the sync request based on our current state.
  7804  func (mset *stream) calculateSyncRequest(state *StreamState, snap *StreamReplicatedState) *streamSyncRequest {
  7805  	// Quick check if we are already caught up.
  7806  	if state.LastSeq >= snap.LastSeq {
  7807  		return nil
  7808  	}
  7809  	return &streamSyncRequest{FirstSeq: state.LastSeq + 1, LastSeq: snap.LastSeq, Peer: mset.node.ID(), DeleteRangesOk: true}
  7810  }
  7811  
  7812  // processSnapshotDeletes will update our current store based on the snapshot
  7813  // but only processing deletes and new FirstSeq / purges.
  7814  func (mset *stream) processSnapshotDeletes(snap *StreamReplicatedState) {
  7815  	mset.mu.Lock()
  7816  	var state StreamState
  7817  	mset.store.FastState(&state)
  7818  	// Always adjust if FirstSeq has moved beyond our state.
  7819  	var didReset bool
  7820  	if snap.FirstSeq > state.FirstSeq {
  7821  		mset.store.Compact(snap.FirstSeq)
  7822  		mset.store.FastState(&state)
  7823  		mset.lseq = state.LastSeq
  7824  		mset.clearAllPreAcksBelowFloor(state.FirstSeq)
  7825  		didReset = true
  7826  	}
  7827  	s := mset.srv
  7828  	mset.mu.Unlock()
  7829  
  7830  	if didReset {
  7831  		s.Warnf("Catchup for stream '%s > %s' resetting first sequence: %d on catchup request",
  7832  			mset.account(), mset.name(), snap.FirstSeq)
  7833  	}
  7834  
  7835  	if len(snap.Deleted) > 0 {
  7836  		mset.store.SyncDeleted(snap.Deleted)
  7837  	}
  7838  }
  7839  
  7840  func (mset *stream) setCatchupPeer(peer string, lag uint64) {
  7841  	if peer == _EMPTY_ {
  7842  		return
  7843  	}
  7844  	mset.mu.Lock()
  7845  	if mset.catchups == nil {
  7846  		mset.catchups = make(map[string]uint64)
  7847  	}
  7848  	mset.catchups[peer] = lag
  7849  	mset.mu.Unlock()
  7850  }
  7851  
  7852  // Will decrement by one.
  7853  func (mset *stream) updateCatchupPeer(peer string) {
  7854  	if peer == _EMPTY_ {
  7855  		return
  7856  	}
  7857  	mset.mu.Lock()
  7858  	if lag := mset.catchups[peer]; lag > 0 {
  7859  		mset.catchups[peer] = lag - 1
  7860  	}
  7861  	mset.mu.Unlock()
  7862  }
  7863  
  7864  func (mset *stream) decrementCatchupPeer(peer string, num uint64) {
  7865  	if peer == _EMPTY_ {
  7866  		return
  7867  	}
  7868  	mset.mu.Lock()
  7869  	if lag := mset.catchups[peer]; lag > 0 {
  7870  		if lag >= num {
  7871  			lag -= num
  7872  		} else {
  7873  			lag = 0
  7874  		}
  7875  		mset.catchups[peer] = lag
  7876  	}
  7877  	mset.mu.Unlock()
  7878  }
  7879  
  7880  func (mset *stream) clearCatchupPeer(peer string) {
  7881  	mset.mu.Lock()
  7882  	if mset.catchups != nil {
  7883  		delete(mset.catchups, peer)
  7884  	}
  7885  	mset.mu.Unlock()
  7886  }
  7887  
  7888  // Lock should be held.
  7889  func (mset *stream) clearAllCatchupPeers() {
  7890  	if mset.catchups != nil {
  7891  		mset.catchups = nil
  7892  	}
  7893  }
  7894  
  7895  func (mset *stream) lagForCatchupPeer(peer string) uint64 {
  7896  	mset.mu.RLock()
  7897  	defer mset.mu.RUnlock()
  7898  	if mset.catchups == nil {
  7899  		return 0
  7900  	}
  7901  	return mset.catchups[peer]
  7902  }
  7903  
  7904  func (mset *stream) hasCatchupPeers() bool {
  7905  	mset.mu.RLock()
  7906  	defer mset.mu.RUnlock()
  7907  	return len(mset.catchups) > 0
  7908  }
  7909  
  7910  func (mset *stream) setCatchingUp() {
  7911  	mset.catchup.Store(true)
  7912  }
  7913  
  7914  func (mset *stream) clearCatchingUp() {
  7915  	mset.catchup.Store(false)
  7916  }
  7917  
  7918  func (mset *stream) isCatchingUp() bool {
  7919  	return mset.catchup.Load()
  7920  }
  7921  
  7922  // Determine if a non-leader is current.
  7923  // Lock should be held.
  7924  func (mset *stream) isCurrent() bool {
  7925  	if mset.node == nil {
  7926  		return true
  7927  	}
  7928  	return mset.node.Current() && !mset.catchup.Load()
  7929  }
  7930  
  7931  // Maximum requests for the whole server that can be in flight at the same time.
  7932  const maxConcurrentSyncRequests = 16
  7933  
  7934  var (
  7935  	errCatchupCorruptSnapshot = errors.New("corrupt stream snapshot detected")
  7936  	errCatchupStalled         = errors.New("catchup stalled")
  7937  	errCatchupStreamStopped   = errors.New("stream has been stopped") // when a catchup is terminated due to the stream going away.
  7938  	errCatchupBadMsg          = errors.New("bad catchup msg")
  7939  	errCatchupWrongSeqForSkip = errors.New("wrong sequence for skipped msg")
  7940  )
  7941  
  7942  // Process a stream snapshot.
  7943  func (mset *stream) processSnapshot(snap *StreamReplicatedState) (e error) {
  7944  	// Update any deletes, etc.
  7945  	mset.processSnapshotDeletes(snap)
  7946  
  7947  	mset.mu.Lock()
  7948  	var state StreamState
  7949  	mset.store.FastState(&state)
  7950  	mset.setCLFS(snap.Failed)
  7951  	sreq := mset.calculateSyncRequest(&state, snap)
  7952  
  7953  	s, js, subject, n, st := mset.srv, mset.js, mset.sa.Sync, mset.node, mset.cfg.Storage
  7954  	qname := fmt.Sprintf("[ACC:%s] stream '%s' snapshot", mset.acc.Name, mset.cfg.Name)
  7955  	mset.mu.Unlock()
  7956  
  7957  	// Bug that would cause this to be empty on stream update.
  7958  	if subject == _EMPTY_ {
  7959  		return errCatchupCorruptSnapshot
  7960  	}
  7961  
  7962  	// Just return if up to date or already exceeded limits.
  7963  	if sreq == nil || js.limitsExceeded(st) {
  7964  		return nil
  7965  	}
  7966  
  7967  	// Pause the apply channel for our raft group while we catch up.
  7968  	if err := n.PauseApply(); err != nil {
  7969  		return err
  7970  	}
  7971  
  7972  	defer func() {
  7973  		// Don't bother resuming if server or stream is gone.
  7974  		if e != errCatchupStreamStopped && e != ErrServerNotRunning {
  7975  			n.ResumeApply()
  7976  		}
  7977  	}()
  7978  
  7979  	// Set our catchup state.
  7980  	mset.setCatchingUp()
  7981  	defer mset.clearCatchingUp()
  7982  
  7983  	var sub *subscription
  7984  	var err error
  7985  
  7986  	const activityInterval = 30 * time.Second
  7987  	notActive := time.NewTimer(activityInterval)
  7988  	defer notActive.Stop()
  7989  
  7990  	defer func() {
  7991  		if sub != nil {
  7992  			s.sysUnsubscribe(sub)
  7993  		}
  7994  		// Make sure any consumers are updated for the pending amounts.
  7995  		mset.mu.Lock()
  7996  		for _, o := range mset.consumers {
  7997  			o.mu.Lock()
  7998  			if o.isLeader() {
  7999  				o.streamNumPending()
  8000  			}
  8001  			o.mu.Unlock()
  8002  		}
  8003  		mset.mu.Unlock()
  8004  	}()
  8005  
  8006  	var releaseSem bool
  8007  	releaseSyncOutSem := func() {
  8008  		if !releaseSem {
  8009  			return
  8010  		}
  8011  		// Need to use select for the server shutdown case.
  8012  		select {
  8013  		case s.syncOutSem <- struct{}{}:
  8014  		default:
  8015  		}
  8016  		releaseSem = false
  8017  	}
  8018  	// On exit, we will release our semaphore if we acquired it.
  8019  	defer releaseSyncOutSem()
  8020  
  8021  	// Check our final state when we exit cleanly.
  8022  	// This will make sure we have interest consumers updated.
  8023  	checkFinalState := func() {
  8024  		// Bail if no stream.
  8025  		if mset == nil {
  8026  			return
  8027  		}
  8028  		mset.mu.RLock()
  8029  		consumers := make([]*consumer, 0, len(mset.consumers))
  8030  		for _, o := range mset.consumers {
  8031  			consumers = append(consumers, o)
  8032  		}
  8033  		mset.mu.RUnlock()
  8034  		for _, o := range consumers {
  8035  			o.checkStateForInterestStream()
  8036  		}
  8037  	}
  8038  
  8039  	// Do not let this go on forever.
  8040  	const maxRetries = 3
  8041  	var numRetries int
  8042  
  8043  RETRY:
  8044  	// On retry, we need to release the semaphore we got. Call will be no-op
  8045  	// if releaseSem boolean has not been set to true on successfully getting
  8046  	// the semaphore.
  8047  	releaseSyncOutSem()
  8048  
  8049  	if n.GroupLeader() == _EMPTY_ {
  8050  		return fmt.Errorf("catchup for stream '%s > %s' aborted, no leader", mset.account(), mset.name())
  8051  	}
  8052  
  8053  	// If we have a sub clear that here.
  8054  	if sub != nil {
  8055  		s.sysUnsubscribe(sub)
  8056  		sub = nil
  8057  	}
  8058  
  8059  	if !s.isRunning() {
  8060  		return ErrServerNotRunning
  8061  	}
  8062  
  8063  	numRetries++
  8064  	if numRetries >= maxRetries {
  8065  		// Force a hard reset here.
  8066  		return errFirstSequenceMismatch
  8067  	}
  8068  
  8069  	// Block here if we have too many requests in flight.
  8070  	<-s.syncOutSem
  8071  	releaseSem = true
  8072  
  8073  	// We may have been blocked for a bit, so the reset needs to ensure that we
  8074  	// consume the already fired timer.
  8075  	if !notActive.Stop() {
  8076  		select {
  8077  		case <-notActive.C:
  8078  		default:
  8079  		}
  8080  	}
  8081  	notActive.Reset(activityInterval)
  8082  
  8083  	// Grab sync request again on failures.
  8084  	if sreq == nil {
  8085  		mset.mu.RLock()
  8086  		var state StreamState
  8087  		mset.store.FastState(&state)
  8088  		sreq = mset.calculateSyncRequest(&state, snap)
  8089  		mset.mu.RUnlock()
  8090  		if sreq == nil {
  8091  			return nil
  8092  		}
  8093  	}
  8094  
  8095  	// Used to transfer message from the wire to another Go routine internally.
  8096  	type im struct {
  8097  		msg   []byte
  8098  		reply string
  8099  	}
  8100  	// This is used to notify the leader that it should stop the runCatchup
  8101  	// because we are either bailing out or going to retry due to an error.
  8102  	notifyLeaderStopCatchup := func(mrec *im, err error) {
  8103  		if mrec.reply == _EMPTY_ {
  8104  			return
  8105  		}
  8106  		s.sendInternalMsgLocked(mrec.reply, _EMPTY_, nil, err.Error())
  8107  	}
  8108  
  8109  	msgsQ := newIPQueue[*im](s, qname)
  8110  	defer msgsQ.unregister()
  8111  
  8112  	// Send our catchup request here.
  8113  	reply := syncReplySubject()
  8114  	sub, err = s.sysSubscribe(reply, func(_ *subscription, _ *client, _ *Account, _, reply string, msg []byte) {
  8115  		// Make copy since we are using a buffer from the inbound client/route.
  8116  		msgsQ.push(&im{copyBytes(msg), reply})
  8117  	})
  8118  	if err != nil {
  8119  		s.Errorf("Could not subscribe to stream catchup: %v", err)
  8120  		goto RETRY
  8121  	}
  8122  
  8123  	// Send our sync request.
  8124  	b, _ := json.Marshal(sreq)
  8125  	s.sendInternalMsgLocked(subject, reply, nil, b)
  8126  	// Remember when we sent this out to avoid loop spins on errors below.
  8127  	reqSendTime := time.Now()
  8128  	// Clear our sync request.
  8129  	sreq = nil
  8130  
  8131  	// Run our own select loop here.
  8132  	for qch, lch := n.QuitC(), n.LeadChangeC(); ; {
  8133  		select {
  8134  		case <-msgsQ.ch:
  8135  			notActive.Reset(activityInterval)
  8136  
  8137  			mrecs := msgsQ.pop()
  8138  			for _, mrec := range mrecs {
  8139  				msg := mrec.msg
  8140  				// Check for eof signaling.
  8141  				if len(msg) == 0 {
  8142  					msgsQ.recycle(&mrecs)
  8143  					checkFinalState()
  8144  					return nil
  8145  				}
  8146  				if _, err := mset.processCatchupMsg(msg); err == nil {
  8147  					if mrec.reply != _EMPTY_ {
  8148  						s.sendInternalMsgLocked(mrec.reply, _EMPTY_, nil, nil)
  8149  					}
  8150  				} else if isOutOfSpaceErr(err) {
  8151  					notifyLeaderStopCatchup(mrec, err)
  8152  					return err
  8153  				} else if err == NewJSInsufficientResourcesError() {
  8154  					notifyLeaderStopCatchup(mrec, err)
  8155  					if mset.js.limitsExceeded(mset.cfg.Storage) {
  8156  						s.resourcesExceededError()
  8157  					} else {
  8158  						s.Warnf("Catchup for stream '%s > %s' errored, account resources exceeded: %v", mset.account(), mset.name(), err)
  8159  					}
  8160  					msgsQ.recycle(&mrecs)
  8161  					return err
  8162  				} else {
  8163  					notifyLeaderStopCatchup(mrec, err)
  8164  					s.Warnf("Catchup for stream '%s > %s' errored, will retry: %v", mset.account(), mset.name(), err)
  8165  					msgsQ.recycle(&mrecs)
  8166  
  8167  					// Make sure we do not spin and make things worse.
  8168  					const minRetryWait = 2 * time.Second
  8169  					elapsed := time.Since(reqSendTime)
  8170  					if elapsed < minRetryWait {
  8171  						select {
  8172  						case <-s.quitCh:
  8173  							return ErrServerNotRunning
  8174  						case <-qch:
  8175  							return errCatchupStreamStopped
  8176  						case <-time.After(minRetryWait - elapsed):
  8177  						}
  8178  					}
  8179  					goto RETRY
  8180  				}
  8181  			}
  8182  			notActive.Reset(activityInterval)
  8183  			msgsQ.recycle(&mrecs)
  8184  		case <-notActive.C:
  8185  			if mrecs := msgsQ.pop(); len(mrecs) > 0 {
  8186  				mrec := mrecs[0]
  8187  				notifyLeaderStopCatchup(mrec, errCatchupStalled)
  8188  				msgsQ.recycle(&mrecs)
  8189  			}
  8190  			s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name())
  8191  			goto RETRY
  8192  		case <-s.quitCh:
  8193  			return ErrServerNotRunning
  8194  		case <-qch:
  8195  			return errCatchupStreamStopped
  8196  		case isLeader := <-lch:
  8197  			if isLeader {
  8198  				n.StepDown()
  8199  				goto RETRY
  8200  			}
  8201  		}
  8202  	}
  8203  }
  8204  
  8205  // processCatchupMsg will be called to process out of band catchup msgs from a sync request.
  8206  func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) {
  8207  	if len(msg) == 0 {
  8208  		return 0, errCatchupBadMsg
  8209  	}
  8210  	op := entryOp(msg[0])
  8211  	if op != streamMsgOp && op != compressedStreamMsgOp && op != deleteRangeOp {
  8212  		return 0, errCatchupBadMsg
  8213  	}
  8214  
  8215  	mbuf := msg[1:]
  8216  	if op == deleteRangeOp {
  8217  		dr, err := decodeDeleteRange(mbuf)
  8218  		if err != nil {
  8219  			return 0, errCatchupBadMsg
  8220  		}
  8221  		// Handle the delete range.
  8222  		// Make sure the sequences match up properly.
  8223  		mset.mu.Lock()
  8224  		if len(mset.preAcks) > 0 {
  8225  			for seq := dr.First; seq < dr.First+dr.Num; seq++ {
  8226  				mset.clearAllPreAcks(seq)
  8227  			}
  8228  		}
  8229  		if err = mset.store.SkipMsgs(dr.First, dr.Num); err != nil {
  8230  			mset.mu.Unlock()
  8231  			return 0, errCatchupWrongSeqForSkip
  8232  		}
  8233  		mset.lseq = dr.First + dr.Num - 1
  8234  		lseq := mset.lseq
  8235  		mset.mu.Unlock()
  8236  		return lseq, nil
  8237  	}
  8238  
  8239  	if op == compressedStreamMsgOp {
  8240  		var err error
  8241  		mbuf, err = s2.Decode(nil, mbuf)
  8242  		if err != nil {
  8243  			panic(err.Error())
  8244  		}
  8245  	}
  8246  
  8247  	subj, _, hdr, msg, seq, ts, err := decodeStreamMsg(mbuf)
  8248  	if err != nil {
  8249  		return 0, errCatchupBadMsg
  8250  	}
  8251  
  8252  	mset.mu.Lock()
  8253  	st := mset.cfg.Storage
  8254  	ddloaded := mset.ddloaded
  8255  	tierName := mset.tier
  8256  	replicas := mset.cfg.Replicas
  8257  
  8258  	if mset.hasAllPreAcks(seq, subj) {
  8259  		mset.clearAllPreAcks(seq)
  8260  		// Mark this to be skipped
  8261  		subj, ts = _EMPTY_, 0
  8262  	}
  8263  	mset.mu.Unlock()
  8264  
  8265  	if mset.js.limitsExceeded(st) {
  8266  		return 0, NewJSInsufficientResourcesError()
  8267  	} else if exceeded, apiErr := mset.jsa.limitsExceeded(st, tierName, replicas); apiErr != nil {
  8268  		return 0, apiErr
  8269  	} else if exceeded {
  8270  		return 0, NewJSInsufficientResourcesError()
  8271  	}
  8272  
  8273  	// Put into our store
  8274  	// Messages to be skipped have no subject or timestamp.
  8275  	// TODO(dlc) - formalize with skipMsgOp
  8276  	if subj == _EMPTY_ && ts == 0 {
  8277  		if lseq := mset.store.SkipMsg(); lseq != seq {
  8278  			return 0, errCatchupWrongSeqForSkip
  8279  		}
  8280  	} else if err := mset.store.StoreRawMsg(subj, hdr, msg, seq, ts); err != nil {
  8281  		return 0, err
  8282  	}
  8283  
  8284  	// Update our lseq.
  8285  	mset.setLastSeq(seq)
  8286  
  8287  	// Check for MsgId and if we have one here make sure to update our internal map.
  8288  	if len(hdr) > 0 {
  8289  		if msgId := getMsgId(hdr); msgId != _EMPTY_ {
  8290  			if !ddloaded {
  8291  				mset.mu.Lock()
  8292  				mset.rebuildDedupe()
  8293  				mset.mu.Unlock()
  8294  			}
  8295  			mset.storeMsgId(&ddentry{msgId, seq, ts})
  8296  		}
  8297  	}
  8298  
  8299  	return seq, nil
  8300  }
  8301  
  8302  func (mset *stream) handleClusterSyncRequest(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
  8303  	var sreq streamSyncRequest
  8304  	if err := json.Unmarshal(msg, &sreq); err != nil {
  8305  		// Log error.
  8306  		return
  8307  	}
  8308  	mset.srv.startGoRoutine(func() { mset.runCatchup(reply, &sreq) })
  8309  }
  8310  
  8311  // Lock should be held.
  8312  func (js *jetStream) offlineClusterInfo(rg *raftGroup) *ClusterInfo {
  8313  	s := js.srv
  8314  
  8315  	ci := &ClusterInfo{Name: s.ClusterName(), RaftGroup: rg.Name}
  8316  	for _, peer := range rg.Peers {
  8317  		if sir, ok := s.nodeToInfo.Load(peer); ok && sir != nil {
  8318  			si := sir.(nodeInfo)
  8319  			pi := &PeerInfo{Peer: peer, Name: si.name, Current: false, Offline: true}
  8320  			ci.Replicas = append(ci.Replicas, pi)
  8321  		}
  8322  	}
  8323  	return ci
  8324  }
  8325  
  8326  // clusterInfo will report on the status of the raft group.
  8327  func (js *jetStream) clusterInfo(rg *raftGroup) *ClusterInfo {
  8328  	if js == nil {
  8329  		return nil
  8330  	}
  8331  	js.mu.RLock()
  8332  	defer js.mu.RUnlock()
  8333  
  8334  	s := js.srv
  8335  	if rg == nil || rg.node == nil {
  8336  		return &ClusterInfo{
  8337  			Name:   s.cachedClusterName(),
  8338  			Leader: s.Name(),
  8339  		}
  8340  	}
  8341  
  8342  	n := rg.node
  8343  	ci := &ClusterInfo{
  8344  		Name:      s.cachedClusterName(),
  8345  		Leader:    s.serverNameForNode(n.GroupLeader()),
  8346  		RaftGroup: rg.Name,
  8347  	}
  8348  
  8349  	now := time.Now()
  8350  	id, peers := n.ID(), n.Peers()
  8351  
  8352  	// If we are leaderless, do not suppress putting us in the peer list.
  8353  	if ci.Leader == _EMPTY_ {
  8354  		id = _EMPTY_
  8355  	}
  8356  
  8357  	for _, rp := range peers {
  8358  		if rp.ID != id && rg.isMember(rp.ID) {
  8359  			var lastSeen time.Duration
  8360  			if now.After(rp.Last) && rp.Last.Unix() != 0 {
  8361  				lastSeen = now.Sub(rp.Last)
  8362  			}
  8363  			current := rp.Current
  8364  			if current && lastSeen > lostQuorumInterval {
  8365  				current = false
  8366  			}
  8367  			// Create a peer info with common settings if the peer has not been seen
  8368  			// yet (which can happen after the whole cluster is stopped and only some
  8369  			// of the nodes are restarted).
  8370  			pi := &PeerInfo{
  8371  				Current: current,
  8372  				Offline: true,
  8373  				Active:  lastSeen,
  8374  				Lag:     rp.Lag,
  8375  				Peer:    rp.ID,
  8376  			}
  8377  			// If node is found, complete/update the settings.
  8378  			if sir, ok := s.nodeToInfo.Load(rp.ID); ok && sir != nil {
  8379  				si := sir.(nodeInfo)
  8380  				pi.Name, pi.Offline, pi.cluster = si.name, si.offline, si.cluster
  8381  			} else {
  8382  				// If not, then add a name that indicates that the server name
  8383  				// is unknown at this time, and clear the lag since it is misleading
  8384  				// (the node may not have that much lag).
  8385  				// Note: We return now the Peer ID in PeerInfo, so the "(peerID: %s)"
  8386  				// would technically not be required, but keeping it for now.
  8387  				pi.Name, pi.Lag = fmt.Sprintf("Server name unknown at this time (peerID: %s)", rp.ID), 0
  8388  			}
  8389  			ci.Replicas = append(ci.Replicas, pi)
  8390  		}
  8391  	}
  8392  	// Order the result based on the name so that we get something consistent
  8393  	// when doing repeated stream info in the CLI, etc...
  8394  	sort.Slice(ci.Replicas, func(i, j int) bool {
  8395  		return ci.Replicas[i].Name < ci.Replicas[j].Name
  8396  	})
  8397  	return ci
  8398  }
  8399  
  8400  func (mset *stream) checkClusterInfo(ci *ClusterInfo) {
  8401  	for _, r := range ci.Replicas {
  8402  		peer := getHash(r.Name)
  8403  		if lag := mset.lagForCatchupPeer(peer); lag > 0 {
  8404  			r.Current = false
  8405  			r.Lag = lag
  8406  		}
  8407  	}
  8408  }
  8409  
  8410  // Return a list of alternates, ranked by preference order to the request, of stream mirrors.
  8411  // This allows clients to select or get more information about read replicas that could be a
  8412  // better option to connect to versus the original source.
  8413  func (js *jetStream) streamAlternates(ci *ClientInfo, stream string) []StreamAlternate {
  8414  	if js == nil {
  8415  		return nil
  8416  	}
  8417  
  8418  	js.mu.RLock()
  8419  	defer js.mu.RUnlock()
  8420  
  8421  	s, cc := js.srv, js.cluster
  8422  	// Track our domain.
  8423  	domain := s.getOpts().JetStreamDomain
  8424  
  8425  	// No clustering just return nil.
  8426  	if cc == nil {
  8427  		return nil
  8428  	}
  8429  	acc, _ := s.LookupAccount(ci.serviceAccount())
  8430  	if acc == nil {
  8431  		return nil
  8432  	}
  8433  
  8434  	// Collect our ordering first for clusters.
  8435  	weights := make(map[string]int)
  8436  	all := []string{ci.Cluster}
  8437  	all = append(all, ci.Alternates...)
  8438  
  8439  	for i := 0; i < len(all); i++ {
  8440  		weights[all[i]] = len(all) - i
  8441  	}
  8442  
  8443  	var alts []StreamAlternate
  8444  	for _, sa := range cc.streams[acc.Name] {
  8445  		// Add in ourselves and any mirrors.
  8446  		if sa.Config.Name == stream || (sa.Config.Mirror != nil && sa.Config.Mirror.Name == stream) {
  8447  			alts = append(alts, StreamAlternate{Name: sa.Config.Name, Domain: domain, Cluster: sa.Group.Cluster})
  8448  		}
  8449  	}
  8450  	// If just us don't fill in.
  8451  	if len(alts) == 1 {
  8452  		return nil
  8453  	}
  8454  
  8455  	// Sort based on our weights that originate from the request itself.
  8456  	sort.Slice(alts, func(i, j int) bool {
  8457  		return weights[alts[i].Cluster] > weights[alts[j].Cluster]
  8458  	})
  8459  
  8460  	return alts
  8461  }
  8462  
  8463  // Internal request for stream info, this is coming on the wire so do not block here.
  8464  func (mset *stream) handleClusterStreamInfoRequest(_ *subscription, c *client, _ *Account, subject, reply string, _ []byte) {
  8465  	go mset.processClusterStreamInfoRequest(reply)
  8466  }
  8467  
  8468  func (mset *stream) processClusterStreamInfoRequest(reply string) {
  8469  	mset.mu.RLock()
  8470  	sysc, js, sa, config := mset.sysc, mset.srv.js.Load(), mset.sa, mset.cfg
  8471  	isLeader := mset.isLeader()
  8472  	mset.mu.RUnlock()
  8473  
  8474  	// By design all members will receive this. Normally we only want the leader answering.
  8475  	// But if we have stalled and lost quorom all can respond.
  8476  	if sa != nil && !js.isGroupLeaderless(sa.Group) && !isLeader {
  8477  		return
  8478  	}
  8479  
  8480  	// If we are not the leader let someone else possibly respond first.
  8481  	if !isLeader {
  8482  		time.Sleep(500 * time.Millisecond)
  8483  	}
  8484  
  8485  	si := &StreamInfo{
  8486  		Created:   mset.createdTime(),
  8487  		State:     mset.state(),
  8488  		Config:    config,
  8489  		Cluster:   js.clusterInfo(mset.raftGroup()),
  8490  		Sources:   mset.sourcesInfo(),
  8491  		Mirror:    mset.mirrorInfo(),
  8492  		TimeStamp: time.Now().UTC(),
  8493  	}
  8494  
  8495  	// Check for out of band catchups.
  8496  	if mset.hasCatchupPeers() {
  8497  		mset.checkClusterInfo(si.Cluster)
  8498  	}
  8499  
  8500  	sysc.sendInternalMsg(reply, _EMPTY_, nil, si)
  8501  }
  8502  
  8503  // 64MB for now, for the total server. This is max we will blast out if asked to
  8504  // do so to another server for purposes of catchups.
  8505  // This number should be ok on 1Gbit interface.
  8506  const defaultMaxTotalCatchupOutBytes = int64(64 * 1024 * 1024)
  8507  
  8508  // Current total outstanding catchup bytes.
  8509  func (s *Server) gcbTotal() int64 {
  8510  	s.gcbMu.RLock()
  8511  	defer s.gcbMu.RUnlock()
  8512  	return s.gcbOut
  8513  }
  8514  
  8515  // Returns true if Current total outstanding catchup bytes is below
  8516  // the maximum configured.
  8517  func (s *Server) gcbBelowMax() bool {
  8518  	s.gcbMu.RLock()
  8519  	defer s.gcbMu.RUnlock()
  8520  	return s.gcbOut <= s.gcbOutMax
  8521  }
  8522  
  8523  // Adds `sz` to the server's total outstanding catchup bytes and to `localsz`
  8524  // under the gcbMu lock. The `localsz` points to the local outstanding catchup
  8525  // bytes of the runCatchup go routine of a given stream.
  8526  func (s *Server) gcbAdd(localsz *int64, sz int64) {
  8527  	s.gcbMu.Lock()
  8528  	atomic.AddInt64(localsz, sz)
  8529  	s.gcbOut += sz
  8530  	if s.gcbOut >= s.gcbOutMax && s.gcbKick == nil {
  8531  		s.gcbKick = make(chan struct{})
  8532  	}
  8533  	s.gcbMu.Unlock()
  8534  }
  8535  
  8536  // Removes `sz` from the server's total outstanding catchup bytes and from
  8537  // `localsz`, but only if `localsz` is non 0, which would signal that gcSubLast
  8538  // has already been invoked. See that function for details.
  8539  // Must be invoked under the gcbMu lock.
  8540  func (s *Server) gcbSubLocked(localsz *int64, sz int64) {
  8541  	if atomic.LoadInt64(localsz) == 0 {
  8542  		return
  8543  	}
  8544  	atomic.AddInt64(localsz, -sz)
  8545  	s.gcbOut -= sz
  8546  	if s.gcbKick != nil && s.gcbOut < s.gcbOutMax {
  8547  		close(s.gcbKick)
  8548  		s.gcbKick = nil
  8549  	}
  8550  }
  8551  
  8552  // Locked version of gcbSubLocked()
  8553  func (s *Server) gcbSub(localsz *int64, sz int64) {
  8554  	s.gcbMu.Lock()
  8555  	s.gcbSubLocked(localsz, sz)
  8556  	s.gcbMu.Unlock()
  8557  }
  8558  
  8559  // Similar to gcbSub() but reset `localsz` to 0 at the end under the gcbMu lock.
  8560  // This will signal further calls to gcbSub() for this `localsz` pointer that
  8561  // nothing should be done because runCatchup() has exited and any remaining
  8562  // outstanding bytes value has already been decremented.
  8563  func (s *Server) gcbSubLast(localsz *int64) {
  8564  	s.gcbMu.Lock()
  8565  	s.gcbSubLocked(localsz, *localsz)
  8566  	*localsz = 0
  8567  	s.gcbMu.Unlock()
  8568  }
  8569  
  8570  // Returns our kick chan, or nil if it does not exist.
  8571  func (s *Server) cbKickChan() <-chan struct{} {
  8572  	s.gcbMu.RLock()
  8573  	defer s.gcbMu.RUnlock()
  8574  	return s.gcbKick
  8575  }
  8576  
  8577  func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) {
  8578  	s := mset.srv
  8579  	defer s.grWG.Done()
  8580  
  8581  	const maxOutBytes = int64(64 * 1024 * 1024) // 64MB for now, these are all internal, from server to server
  8582  	const maxOutMsgs = int32(256 * 1024)        // 256k in case we have lots of small messages or skip msgs.
  8583  	outb := int64(0)
  8584  	outm := int32(0)
  8585  
  8586  	// On abnormal exit make sure to update global total.
  8587  	defer s.gcbSubLast(&outb)
  8588  
  8589  	// Flow control processing.
  8590  	ackReplySize := func(subj string) int64 {
  8591  		if li := strings.LastIndexByte(subj, btsep); li > 0 && li < len(subj) {
  8592  			return parseAckReplyNum(subj[li+1:])
  8593  		}
  8594  		return 0
  8595  	}
  8596  
  8597  	nextBatchC := make(chan struct{}, 1)
  8598  	nextBatchC <- struct{}{}
  8599  	remoteQuitCh := make(chan struct{})
  8600  
  8601  	const activityInterval = 30 * time.Second
  8602  	notActive := time.NewTimer(activityInterval)
  8603  	defer notActive.Stop()
  8604  
  8605  	// Setup ackReply for flow control.
  8606  	ackReply := syncAckSubject()
  8607  	ackSub, _ := s.sysSubscribe(ackReply, func(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
  8608  		if len(msg) > 0 {
  8609  			s.Warnf("Catchup for stream '%s > %s' was aborted on the remote due to: %q",
  8610  				mset.account(), mset.name(), msg)
  8611  			s.sysUnsubscribe(sub)
  8612  			close(remoteQuitCh)
  8613  			return
  8614  		}
  8615  		sz := ackReplySize(subject)
  8616  		s.gcbSub(&outb, sz)
  8617  		atomic.AddInt32(&outm, -1)
  8618  		mset.updateCatchupPeer(sreq.Peer)
  8619  		// Kick ourselves and anyone else who might have stalled on global state.
  8620  		select {
  8621  		case nextBatchC <- struct{}{}:
  8622  			// Reset our activity
  8623  			notActive.Reset(activityInterval)
  8624  		default:
  8625  		}
  8626  	})
  8627  	defer s.sysUnsubscribe(ackSub)
  8628  	ackReplyT := strings.ReplaceAll(ackReply, ".*", ".%d")
  8629  
  8630  	// Grab our state.
  8631  	var state StreamState
  8632  	mset.mu.RLock()
  8633  	mset.store.FastState(&state)
  8634  	mset.mu.RUnlock()
  8635  
  8636  	// Reset notion of first if this request wants sequences before our starting sequence
  8637  	// and we would have nothing to send. If we have partial messages still need to send skips for those.
  8638  	// We will keep sreq's first sequence to not create sequence mismatches on the follower, but we extend the last to our current state.
  8639  	if sreq.FirstSeq < state.FirstSeq && state.FirstSeq > sreq.LastSeq {
  8640  		s.Debugf("Catchup for stream '%s > %s' resetting request first sequence from %d to %d",
  8641  			mset.account(), mset.name(), sreq.FirstSeq, state.FirstSeq)
  8642  		if state.LastSeq > sreq.LastSeq {
  8643  			sreq.LastSeq = state.LastSeq
  8644  		}
  8645  	}
  8646  
  8647  	// Setup sequences to walk through.
  8648  	seq, last := sreq.FirstSeq, sreq.LastSeq
  8649  	mset.setCatchupPeer(sreq.Peer, last-seq)
  8650  
  8651  	// Check if we can compress during this.
  8652  	compressOk := mset.compressAllowed()
  8653  
  8654  	var spb int
  8655  	const minWait = 5 * time.Second
  8656  
  8657  	sendNextBatchAndContinue := func(qch chan struct{}) bool {
  8658  		// Check if we know we will not enter the loop because we are done.
  8659  		if seq > last {
  8660  			s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name())
  8661  			// EOF
  8662  			s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil)
  8663  			return false
  8664  		}
  8665  
  8666  		// If we already sent a batch, we will try to make sure we can at least send a minimum
  8667  		// batch before sending the next batch.
  8668  		if spb > 0 {
  8669  			// Wait til we can send at least 4k
  8670  			const minBatchWait = int32(4 * 1024)
  8671  			mw := time.NewTimer(minWait)
  8672  			for done := false; !done; {
  8673  				select {
  8674  				case <-nextBatchC:
  8675  					done = maxOutMsgs-atomic.LoadInt32(&outm) > minBatchWait
  8676  					if !done {
  8677  						// Wait for a small bit.
  8678  						time.Sleep(50 * time.Millisecond)
  8679  					} else {
  8680  						// GC friendly.
  8681  						mw.Stop()
  8682  					}
  8683  				case <-mw.C:
  8684  					done = true
  8685  				case <-s.quitCh:
  8686  					return false
  8687  				case <-qch:
  8688  					return false
  8689  				case <-remoteQuitCh:
  8690  					return false
  8691  				}
  8692  			}
  8693  			spb = 0
  8694  		}
  8695  
  8696  		// Send an encoded msg.
  8697  		sendEM := func(em []byte) {
  8698  			// Place size in reply subject for flow control.
  8699  			l := int64(len(em))
  8700  			reply := fmt.Sprintf(ackReplyT, l)
  8701  			s.gcbAdd(&outb, l)
  8702  			atomic.AddInt32(&outm, 1)
  8703  			s.sendInternalMsgLocked(sendSubject, reply, nil, em)
  8704  			spb++
  8705  		}
  8706  
  8707  		// If we support gap markers.
  8708  		var dr DeleteRange
  8709  		drOk := sreq.DeleteRangesOk
  8710  
  8711  		// Will send our delete range.
  8712  		// Should already be checked for being valid.
  8713  		sendDR := func() {
  8714  			if dr.Num == 1 {
  8715  				// Send like a normal skip msg.
  8716  				sendEM(encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, dr.First, 0))
  8717  			} else {
  8718  				// We have a run, send a gap record. We send these without reply or tracking.
  8719  				s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, encodeDeleteRange(&dr))
  8720  				// Clear out the pending for catchup.
  8721  				mset.decrementCatchupPeer(sreq.Peer, dr.Num)
  8722  			}
  8723  			// Reset always.
  8724  			dr.First, dr.Num = 0, 0
  8725  		}
  8726  
  8727  		var smv StoreMsg
  8728  		for ; seq <= last && atomic.LoadInt64(&outb) <= maxOutBytes && atomic.LoadInt32(&outm) <= maxOutMsgs && s.gcbBelowMax(); seq++ {
  8729  			sm, err := mset.store.LoadMsg(seq, &smv)
  8730  			// if this is not a deleted msg, bail out.
  8731  			if err != nil && err != ErrStoreMsgNotFound && err != errDeletedMsg {
  8732  				if err == ErrStoreEOF {
  8733  					var state StreamState
  8734  					mset.store.FastState(&state)
  8735  					if seq > state.LastSeq {
  8736  						// The snapshot has a larger last sequence then we have. This could be due to a truncation
  8737  						// when trying to recover after corruption, still not 100% sure. Could be off by 1 too somehow,
  8738  						// but tested a ton of those with no success.
  8739  						s.Warnf("Catchup for stream '%s > %s' completed, but requested sequence %d was larger then current state: %+v",
  8740  							mset.account(), mset.name(), seq, state)
  8741  						// Try our best to redo our invalidated snapshot as well.
  8742  						if n := mset.raftNode(); n != nil {
  8743  							n.InstallSnapshot(mset.stateSnapshot())
  8744  						}
  8745  						// Signal EOF
  8746  						s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil)
  8747  						return false
  8748  					}
  8749  				}
  8750  				s.Warnf("Error loading message for catchup '%s > %s': %v", mset.account(), mset.name(), err)
  8751  				return false
  8752  			}
  8753  
  8754  			if sm != nil {
  8755  				// If we allow gap markers check if we have one pending.
  8756  				if drOk && dr.First > 0 {
  8757  					sendDR()
  8758  				}
  8759  				// Send the normal message now.
  8760  				sendEM(encodeStreamMsgAllowCompress(sm.subj, _EMPTY_, sm.hdr, sm.msg, sm.seq, sm.ts, compressOk))
  8761  			} else {
  8762  				if drOk {
  8763  					if dr.First == 0 {
  8764  						dr.First, dr.Num = seq, 1
  8765  					} else {
  8766  						dr.Num++
  8767  					}
  8768  				} else {
  8769  					// Skip record for deleted msg.
  8770  					sendEM(encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq, 0))
  8771  				}
  8772  			}
  8773  
  8774  			// Check if we are done.
  8775  			if seq == last {
  8776  				// Need to see if we have a pending delete range.
  8777  				if drOk && dr.First > 0 {
  8778  					sendDR()
  8779  				}
  8780  				// Check for a condition where our state's first is now past the last that we could have sent.
  8781  				// If so reset last and continue sending.
  8782  				var state StreamState
  8783  				mset.mu.RLock()
  8784  				mset.store.FastState(&state)
  8785  				mset.mu.RUnlock()
  8786  				if last < state.FirstSeq {
  8787  					last = state.LastSeq
  8788  				}
  8789  				// Recheck our exit condition.
  8790  				if seq == last {
  8791  					s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name())
  8792  					// EOF
  8793  					s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil)
  8794  					return false
  8795  				}
  8796  			}
  8797  			select {
  8798  			case <-remoteQuitCh:
  8799  				return false
  8800  			default:
  8801  			}
  8802  		}
  8803  		if drOk && dr.First > 0 {
  8804  			sendDR()
  8805  		}
  8806  
  8807  		return true
  8808  	}
  8809  
  8810  	// Check is this stream got closed.
  8811  	mset.mu.RLock()
  8812  	qch := mset.qch
  8813  	mset.mu.RUnlock()
  8814  	if qch == nil {
  8815  		return
  8816  	}
  8817  
  8818  	// Run as long as we are still active and need catchup.
  8819  	// FIXME(dlc) - Purge event? Stream delete?
  8820  	for {
  8821  		// Get this each time, will be non-nil if globally blocked and we will close to wake everyone up.
  8822  		cbKick := s.cbKickChan()
  8823  
  8824  		select {
  8825  		case <-s.quitCh:
  8826  			return
  8827  		case <-qch:
  8828  			return
  8829  		case <-remoteQuitCh:
  8830  			mset.clearCatchupPeer(sreq.Peer)
  8831  			return
  8832  		case <-notActive.C:
  8833  			s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name())
  8834  			mset.clearCatchupPeer(sreq.Peer)
  8835  			return
  8836  		case <-nextBatchC:
  8837  			if !sendNextBatchAndContinue(qch) {
  8838  				mset.clearCatchupPeer(sreq.Peer)
  8839  				return
  8840  			}
  8841  		case <-cbKick:
  8842  			if !sendNextBatchAndContinue(qch) {
  8843  				mset.clearCatchupPeer(sreq.Peer)
  8844  				return
  8845  			}
  8846  		}
  8847  	}
  8848  }
  8849  
  8850  const jscAllSubj = "$JSC.>"
  8851  
  8852  func syncSubjForStream() string {
  8853  	return syncSubject("$JSC.SYNC")
  8854  }
  8855  
  8856  func syncReplySubject() string {
  8857  	return syncSubject("$JSC.R")
  8858  }
  8859  
  8860  func infoReplySubject() string {
  8861  	return syncSubject("$JSC.R")
  8862  }
  8863  
  8864  func syncAckSubject() string {
  8865  	return syncSubject("$JSC.ACK") + ".*"
  8866  }
  8867  
  8868  func syncSubject(pre string) string {
  8869  	var sb strings.Builder
  8870  	sb.WriteString(pre)
  8871  	sb.WriteByte(btsep)
  8872  
  8873  	var b [replySuffixLen]byte
  8874  	rn := rand.Int63()
  8875  	for i, l := 0, rn; i < len(b); i++ {
  8876  		b[i] = digits[l%base]
  8877  		l /= base
  8878  	}
  8879  
  8880  	sb.Write(b[:])
  8881  	return sb.String()
  8882  }
  8883  
  8884  const (
  8885  	clusterStreamInfoT   = "$JSC.SI.%s.%s"
  8886  	clusterConsumerInfoT = "$JSC.CI.%s.%s.%s"
  8887  	jsaUpdatesSubT       = "$JSC.ARU.%s.*"
  8888  	jsaUpdatesPubT       = "$JSC.ARU.%s.%s"
  8889  )