github.com/nats-io/nats-server/v2@v2.11.0-preview.2/server/jetstream_cluster.go (about)

     1  // Copyright 2020-2024 The NATS Authors
     2  // Licensed under the Apache License, Version 2.0 (the "License");
     3  // you may not use this file except in compliance with the License.
     4  // You may obtain a copy of the License at
     5  //
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package server
    15  
    16  import (
    17  	"bytes"
    18  	crand "crypto/rand"
    19  	"encoding/binary"
    20  	"encoding/json"
    21  	"errors"
    22  	"fmt"
    23  	"math"
    24  	"math/rand"
    25  	"os"
    26  	"path/filepath"
    27  	"reflect"
    28  	"sort"
    29  	"strconv"
    30  	"strings"
    31  	"sync/atomic"
    32  	"time"
    33  
    34  	"github.com/klauspost/compress/s2"
    35  	"github.com/minio/highwayhash"
    36  	"github.com/nats-io/nuid"
    37  )
    38  
    39  // jetStreamCluster holds information about the meta group and stream assignments.
    40  type jetStreamCluster struct {
    41  	// The metacontroller raftNode.
    42  	meta RaftNode
    43  	// For stream and consumer assignments. All servers will have this be the same.
    44  	// ACCOUNT -> STREAM -> Stream Assignment -> Consumers
    45  	streams map[string]map[string]*streamAssignment
    46  	// These are inflight proposals and used to apply limits when there are
    47  	// concurrent requests that would otherwise be accepted.
    48  	// We also record the group for the stream. This is needed since if we have
    49  	// concurrent requests for same account and stream we need to let it process to get
    50  	// a response but they need to be same group, peers etc.
    51  	inflight map[string]map[string]*raftGroup
    52  	// Signals meta-leader should check the stream assignments.
    53  	streamsCheck bool
    54  	// Server.
    55  	s *Server
    56  	// Internal client.
    57  	c *client
    58  	// Processing assignment results.
    59  	streamResults   *subscription
    60  	consumerResults *subscription
    61  	// System level request to have the leader stepdown.
    62  	stepdown *subscription
    63  	// System level requests to remove a peer.
    64  	peerRemove *subscription
    65  	// System level request to move a stream
    66  	peerStreamMove *subscription
    67  	// System level request to cancel a stream move
    68  	peerStreamCancelMove *subscription
    69  	// To pop out the monitorCluster before the raft layer.
    70  	qch chan struct{}
    71  }
    72  
    73  // Used to guide placement of streams and meta controllers in clustered JetStream.
    74  type Placement struct {
    75  	Cluster string   `json:"cluster,omitempty"`
    76  	Tags    []string `json:"tags,omitempty"`
    77  }
    78  
    79  // Define types of the entry.
    80  type entryOp uint8
    81  
    82  // ONLY ADD TO THE END, DO NOT INSERT IN BETWEEN WILL BREAK SERVER INTEROP.
    83  const (
    84  	// Meta ops.
    85  	assignStreamOp entryOp = iota
    86  	assignConsumerOp
    87  	removeStreamOp
    88  	removeConsumerOp
    89  	// Stream ops.
    90  	streamMsgOp
    91  	purgeStreamOp
    92  	deleteMsgOp
    93  	// Consumer ops.
    94  	updateDeliveredOp
    95  	updateAcksOp
    96  	// Compressed consumer assignments.
    97  	assignCompressedConsumerOp
    98  	// Filtered Consumer skip.
    99  	updateSkipOp
   100  	// Update Stream.
   101  	updateStreamOp
   102  	// For updating information on pending pull requests.
   103  	addPendingRequest
   104  	removePendingRequest
   105  	// For sending compressed streams, either through RAFT or catchup.
   106  	compressedStreamMsgOp
   107  	// For sending deleted gaps on catchups for replicas.
   108  	deleteRangeOp
   109  )
   110  
   111  // raftGroups are controlled by the metagroup controller.
   112  // The raftGroups will house streams and consumers.
   113  type raftGroup struct {
   114  	Name      string      `json:"name"`
   115  	Peers     []string    `json:"peers"`
   116  	Storage   StorageType `json:"store"`
   117  	Cluster   string      `json:"cluster,omitempty"`
   118  	Preferred string      `json:"preferred,omitempty"`
   119  	// Internal
   120  	node RaftNode
   121  }
   122  
   123  // streamAssignment is what the meta controller uses to assign streams to peers.
   124  type streamAssignment struct {
   125  	Client  *ClientInfo   `json:"client,omitempty"`
   126  	Created time.Time     `json:"created"`
   127  	Config  *StreamConfig `json:"stream"`
   128  	Group   *raftGroup    `json:"group"`
   129  	Sync    string        `json:"sync"`
   130  	Subject string        `json:"subject"`
   131  	Reply   string        `json:"reply"`
   132  	Restore *StreamState  `json:"restore_state,omitempty"`
   133  	// Internal
   134  	consumers  map[string]*consumerAssignment
   135  	responded  bool
   136  	recovering bool
   137  	err        error
   138  }
   139  
   140  // consumerAssignment is what the meta controller uses to assign consumers to streams.
   141  type consumerAssignment struct {
   142  	Client  *ClientInfo     `json:"client,omitempty"`
   143  	Created time.Time       `json:"created"`
   144  	Name    string          `json:"name"`
   145  	Stream  string          `json:"stream"`
   146  	Config  *ConsumerConfig `json:"consumer"`
   147  	Group   *raftGroup      `json:"group"`
   148  	Subject string          `json:"subject"`
   149  	Reply   string          `json:"reply"`
   150  	State   *ConsumerState  `json:"state,omitempty"`
   151  	// Internal
   152  	responded  bool
   153  	recovering bool
   154  	deleted    bool
   155  	err        error
   156  }
   157  
   158  // streamPurge is what the stream leader will replicate when purging a stream.
   159  type streamPurge struct {
   160  	Client  *ClientInfo              `json:"client,omitempty"`
   161  	Stream  string                   `json:"stream"`
   162  	LastSeq uint64                   `json:"last_seq"`
   163  	Subject string                   `json:"subject"`
   164  	Reply   string                   `json:"reply"`
   165  	Request *JSApiStreamPurgeRequest `json:"request,omitempty"`
   166  }
   167  
   168  // streamMsgDelete is what the stream leader will replicate when deleting a message.
   169  type streamMsgDelete struct {
   170  	Client  *ClientInfo `json:"client,omitempty"`
   171  	Stream  string      `json:"stream"`
   172  	Seq     uint64      `json:"seq"`
   173  	NoErase bool        `json:"no_erase,omitempty"`
   174  	Subject string      `json:"subject"`
   175  	Reply   string      `json:"reply"`
   176  }
   177  
   178  const (
   179  	defaultStoreDirName  = "_js_"
   180  	defaultMetaGroupName = "_meta_"
   181  	defaultMetaFSBlkSize = 1024 * 1024
   182  	jsExcludePlacement   = "!jetstream"
   183  )
   184  
   185  // Returns information useful in mixed mode.
   186  func (s *Server) trackedJetStreamServers() (js, total int) {
   187  	s.mu.RLock()
   188  	defer s.mu.RUnlock()
   189  	if !s.isRunning() || !s.eventsEnabled() {
   190  		return -1, -1
   191  	}
   192  	s.nodeToInfo.Range(func(k, v any) bool {
   193  		si := v.(nodeInfo)
   194  		if si.js {
   195  			js++
   196  		}
   197  		total++
   198  		return true
   199  	})
   200  	return js, total
   201  }
   202  
   203  func (s *Server) getJetStreamCluster() (*jetStream, *jetStreamCluster) {
   204  	if s.isShuttingDown() {
   205  		return nil, nil
   206  	}
   207  
   208  	js := s.getJetStream()
   209  	if js == nil {
   210  		return nil, nil
   211  	}
   212  
   213  	// Only set once, do not need a lock.
   214  	return js, js.cluster
   215  }
   216  
   217  func (s *Server) JetStreamIsClustered() bool {
   218  	js := s.getJetStream()
   219  	if js == nil {
   220  		return false
   221  	}
   222  	return js.isClustered()
   223  }
   224  
   225  func (s *Server) JetStreamIsLeader() bool {
   226  	return s.isMetaLeader.Load()
   227  }
   228  
   229  func (s *Server) JetStreamIsCurrent() bool {
   230  	js := s.getJetStream()
   231  	if js == nil {
   232  		return false
   233  	}
   234  	// Grab what we need and release js lock.
   235  	js.mu.RLock()
   236  	var meta RaftNode
   237  	cc := js.cluster
   238  	if cc != nil {
   239  		meta = cc.meta
   240  	}
   241  	js.mu.RUnlock()
   242  
   243  	if cc == nil {
   244  		// Non-clustered mode
   245  		return true
   246  	}
   247  	return meta.Current()
   248  }
   249  
   250  func (s *Server) JetStreamSnapshotMeta() error {
   251  	js := s.getJetStream()
   252  	if js == nil {
   253  		return NewJSNotEnabledError()
   254  	}
   255  	js.mu.RLock()
   256  	cc := js.cluster
   257  	isLeader := cc.isLeader()
   258  	meta := cc.meta
   259  	js.mu.RUnlock()
   260  
   261  	if !isLeader {
   262  		return errNotLeader
   263  	}
   264  
   265  	return meta.InstallSnapshot(js.metaSnapshot())
   266  }
   267  
   268  func (s *Server) JetStreamStepdownStream(account, stream string) error {
   269  	js, cc := s.getJetStreamCluster()
   270  	if js == nil {
   271  		return NewJSNotEnabledError()
   272  	}
   273  	if cc == nil {
   274  		return NewJSClusterNotActiveError()
   275  	}
   276  	// Grab account
   277  	acc, err := s.LookupAccount(account)
   278  	if err != nil {
   279  		return err
   280  	}
   281  	// Grab stream
   282  	mset, err := acc.lookupStream(stream)
   283  	if err != nil {
   284  		return err
   285  	}
   286  
   287  	if node := mset.raftNode(); node != nil && node.Leader() {
   288  		node.StepDown()
   289  	}
   290  
   291  	return nil
   292  }
   293  
   294  func (s *Server) JetStreamStepdownConsumer(account, stream, consumer string) error {
   295  	js, cc := s.getJetStreamCluster()
   296  	if js == nil {
   297  		return NewJSNotEnabledError()
   298  	}
   299  	if cc == nil {
   300  		return NewJSClusterNotActiveError()
   301  	}
   302  	// Grab account
   303  	acc, err := s.LookupAccount(account)
   304  	if err != nil {
   305  		return err
   306  	}
   307  	// Grab stream
   308  	mset, err := acc.lookupStream(stream)
   309  	if err != nil {
   310  		return err
   311  	}
   312  
   313  	o := mset.lookupConsumer(consumer)
   314  	if o == nil {
   315  		return NewJSConsumerNotFoundError()
   316  	}
   317  
   318  	if node := o.raftNode(); node != nil && node.Leader() {
   319  		node.StepDown()
   320  	}
   321  
   322  	return nil
   323  }
   324  
   325  func (s *Server) JetStreamSnapshotStream(account, stream string) error {
   326  	js, cc := s.getJetStreamCluster()
   327  	if js == nil {
   328  		return NewJSNotEnabledForAccountError()
   329  	}
   330  	if cc == nil {
   331  		return NewJSClusterNotActiveError()
   332  	}
   333  	// Grab account
   334  	acc, err := s.LookupAccount(account)
   335  	if err != nil {
   336  		return err
   337  	}
   338  	// Grab stream
   339  	mset, err := acc.lookupStream(stream)
   340  	if err != nil {
   341  		return err
   342  	}
   343  
   344  	// Hold lock when installing snapshot.
   345  	mset.mu.Lock()
   346  	if mset.node == nil {
   347  		mset.mu.Unlock()
   348  		return nil
   349  	}
   350  	err = mset.node.InstallSnapshot(mset.stateSnapshotLocked())
   351  	mset.mu.Unlock()
   352  
   353  	return err
   354  }
   355  
   356  func (s *Server) JetStreamClusterPeers() []string {
   357  	js := s.getJetStream()
   358  	if js == nil {
   359  		return nil
   360  	}
   361  	js.mu.RLock()
   362  	defer js.mu.RUnlock()
   363  
   364  	cc := js.cluster
   365  	if !cc.isLeader() || cc.meta == nil {
   366  		return nil
   367  	}
   368  	peers := cc.meta.Peers()
   369  	var nodes []string
   370  	for _, p := range peers {
   371  		si, ok := s.nodeToInfo.Load(p.ID)
   372  		if !ok || si == nil {
   373  			continue
   374  		}
   375  		ni := si.(nodeInfo)
   376  		// Ignore if offline, no JS, or no current stats have been received.
   377  		if ni.offline || !ni.js || ni.stats == nil {
   378  			continue
   379  		}
   380  		nodes = append(nodes, si.(nodeInfo).name)
   381  	}
   382  	return nodes
   383  }
   384  
   385  // Read lock should be held.
   386  func (cc *jetStreamCluster) isLeader() bool {
   387  	if cc == nil {
   388  		// Non-clustered mode
   389  		return true
   390  	}
   391  	return cc.meta != nil && cc.meta.Leader()
   392  }
   393  
   394  // isStreamCurrent will determine if the stream is up to date.
   395  // For R1 it will make sure the stream is present on this server.
   396  // Read lock should be held.
   397  func (cc *jetStreamCluster) isStreamCurrent(account, stream string) bool {
   398  	if cc == nil {
   399  		// Non-clustered mode
   400  		return true
   401  	}
   402  	as := cc.streams[account]
   403  	if as == nil {
   404  		return false
   405  	}
   406  	sa := as[stream]
   407  	if sa == nil {
   408  		return false
   409  	}
   410  	rg := sa.Group
   411  	if rg == nil {
   412  		return false
   413  	}
   414  
   415  	if rg.node == nil || rg.node.Current() {
   416  		// Check if we are processing a snapshot and are catching up.
   417  		acc, err := cc.s.LookupAccount(account)
   418  		if err != nil {
   419  			return false
   420  		}
   421  		mset, err := acc.lookupStream(stream)
   422  		if err != nil {
   423  			return false
   424  		}
   425  		if mset.isCatchingUp() {
   426  			return false
   427  		}
   428  		// Success.
   429  		return true
   430  	}
   431  
   432  	return false
   433  }
   434  
   435  // Restart the stream in question.
   436  // Should only be called when the stream is known to be in a bad state.
   437  func (js *jetStream) restartStream(acc *Account, csa *streamAssignment) {
   438  	js.mu.Lock()
   439  	s, cc := js.srv, js.cluster
   440  	if cc == nil {
   441  		js.mu.Unlock()
   442  		return
   443  	}
   444  	// Need to lookup the one directly from the meta layer, what we get handed is a copy if coming from isStreamHealthy.
   445  	asa := cc.streams[acc.Name]
   446  	if asa == nil {
   447  		js.mu.Unlock()
   448  		return
   449  	}
   450  	sa := asa[csa.Config.Name]
   451  	if sa == nil {
   452  		js.mu.Unlock()
   453  		return
   454  	}
   455  	// Make sure to clear out the raft node if still present in the meta layer.
   456  	if rg := sa.Group; rg != nil && rg.node != nil {
   457  		if rg.node.State() != Closed {
   458  			rg.node.Stop()
   459  		}
   460  		rg.node = nil
   461  	}
   462  	sinceCreation := time.Since(sa.Created)
   463  	js.mu.Unlock()
   464  
   465  	// Process stream assignment to recreate.
   466  	// Check that we have given system enough time to start us up.
   467  	// This will be longer than obvious, and matches consumer logic in case system very busy.
   468  	if sinceCreation < 10*time.Second {
   469  		s.Debugf("Not restarting missing stream '%s > %s', too soon since creation %v",
   470  			acc, csa.Config.Name, sinceCreation)
   471  		return
   472  	}
   473  
   474  	js.processStreamAssignment(sa)
   475  
   476  	// If we had consumers assigned to this server they will be present in the copy, csa.
   477  	// They also need to be processed. The csa consumers is a copy of only our consumers,
   478  	// those assigned to us, but the consumer assignment's there are direct from the meta
   479  	// layer to make this part much easier and avoid excessive lookups.
   480  	for _, cca := range csa.consumers {
   481  		if cca.deleted {
   482  			continue
   483  		}
   484  		// Need to look up original as well here to make sure node is nil.
   485  		js.mu.Lock()
   486  		ca := sa.consumers[cca.Name]
   487  		if ca != nil && ca.Group != nil {
   488  			// Make sure the node is stopped if still running.
   489  			if node := ca.Group.node; node != nil && node.State() != Closed {
   490  				node.Stop()
   491  			}
   492  			// Make sure node is wiped.
   493  			ca.Group.node = nil
   494  		}
   495  		js.mu.Unlock()
   496  		if ca != nil {
   497  			js.processConsumerAssignment(ca)
   498  		}
   499  	}
   500  }
   501  
   502  // isStreamHealthy will determine if the stream is up to date or very close.
   503  // For R1 it will make sure the stream is present on this server.
   504  func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool {
   505  	js.mu.RLock()
   506  	s, cc := js.srv, js.cluster
   507  	if cc == nil {
   508  		// Non-clustered mode
   509  		js.mu.RUnlock()
   510  		return true
   511  	}
   512  
   513  	// Pull the group out.
   514  	rg := sa.Group
   515  	if rg == nil {
   516  		js.mu.RUnlock()
   517  		return false
   518  	}
   519  
   520  	streamName := sa.Config.Name
   521  	node := rg.node
   522  	js.mu.RUnlock()
   523  
   524  	// First lookup stream and make sure its there.
   525  	mset, err := acc.lookupStream(streamName)
   526  	if err != nil {
   527  		js.restartStream(acc, sa)
   528  		return false
   529  	}
   530  
   531  	// If we are catching up return false.
   532  	if mset.isCatchingUp() {
   533  		return false
   534  	}
   535  
   536  	if node == nil || node.Healthy() {
   537  		// Check if we are processing a snapshot and are catching up.
   538  		if !mset.isCatchingUp() {
   539  			return true
   540  		}
   541  	} else { // node != nil
   542  		if node != mset.raftNode() {
   543  			s.Warnf("Detected stream cluster node skew '%s > %s'", acc.GetName(), streamName)
   544  			node.Delete()
   545  			mset.resetClusteredState(nil)
   546  		} else if node.State() == Closed {
   547  			js.restartStream(acc, sa)
   548  		}
   549  	}
   550  
   551  	return false
   552  }
   553  
   554  // isConsumerHealthy will determine if the consumer is up to date.
   555  // For R1 it will make sure the consunmer is present on this server.
   556  func (js *jetStream) isConsumerHealthy(mset *stream, consumer string, ca *consumerAssignment) bool {
   557  	if mset == nil {
   558  		return false
   559  	}
   560  
   561  	js.mu.RLock()
   562  	cc := js.cluster
   563  	if cc == nil {
   564  		// Non-clustered mode
   565  		js.mu.RUnlock()
   566  		return true
   567  	}
   568  	// These are required.
   569  	if ca == nil || ca.Group == nil {
   570  		js.mu.RUnlock()
   571  		return false
   572  	}
   573  	s := js.srv
   574  	js.mu.RUnlock()
   575  
   576  	// Capture RAFT node from assignment.
   577  	node := ca.Group.node
   578  
   579  	// When we try to restart we nil out the node if applicable
   580  	// and reprocess the consumer assignment.
   581  	restartConsumer := func() {
   582  		mset.mu.RLock()
   583  		accName, streamName := mset.acc.GetName(), mset.cfg.Name
   584  		mset.mu.RUnlock()
   585  
   586  		js.mu.Lock()
   587  		deleted := ca.deleted
   588  		// Check that we have not just been created.
   589  		if !deleted && time.Since(ca.Created) < 10*time.Second {
   590  			s.Debugf("Not restarting missing consumer '%s > %s > %s', too soon since creation %v",
   591  				accName, streamName, consumer, time.Since(ca.Created))
   592  			js.mu.Unlock()
   593  			return
   594  		}
   595  		// Make sure the node is stopped if still running.
   596  		if node != nil && node.State() != Closed {
   597  			node.Stop()
   598  		}
   599  		ca.Group.node = nil
   600  		js.mu.Unlock()
   601  		if !deleted {
   602  			js.processConsumerAssignment(ca)
   603  		}
   604  	}
   605  
   606  	// Check if not running at all.
   607  	o := mset.lookupConsumer(consumer)
   608  	if o == nil {
   609  		restartConsumer()
   610  		return false
   611  	}
   612  
   613  	// Check RAFT node state.
   614  	if node == nil || node.Healthy() {
   615  		return true
   616  	} else if node != nil {
   617  		if node != o.raftNode() {
   618  			mset.mu.RLock()
   619  			accName, streamName := mset.acc.GetName(), mset.cfg.Name
   620  			mset.mu.RUnlock()
   621  			s.Warnf("Detected consumer cluster node skew '%s > %s > %s'", accName, streamName, consumer)
   622  			node.Delete()
   623  			o.deleteWithoutAdvisory()
   624  			restartConsumer()
   625  		} else if node.State() == Closed {
   626  			// We have a consumer, and it should have a running node but it is closed.
   627  			o.stop()
   628  			restartConsumer()
   629  		}
   630  	}
   631  	return false
   632  }
   633  
   634  // subjectsOverlap checks all existing stream assignments for the account cross-cluster for subject overlap
   635  // Use only for clustered JetStream
   636  // Read lock should be held.
   637  func (jsc *jetStreamCluster) subjectsOverlap(acc string, subjects []string, osa *streamAssignment) bool {
   638  	asa := jsc.streams[acc]
   639  	for _, sa := range asa {
   640  		// can't overlap yourself, assume osa pre-checked for deep equal if passed
   641  		if osa != nil && sa == osa {
   642  			continue
   643  		}
   644  		for _, subj := range sa.Config.Subjects {
   645  			for _, tsubj := range subjects {
   646  				if SubjectsCollide(tsubj, subj) {
   647  					return true
   648  				}
   649  			}
   650  		}
   651  	}
   652  	return false
   653  }
   654  
   655  func (a *Account) getJetStreamFromAccount() (*Server, *jetStream, *jsAccount) {
   656  	a.mu.RLock()
   657  	jsa := a.js
   658  	a.mu.RUnlock()
   659  	if jsa == nil {
   660  		return nil, nil, nil
   661  	}
   662  	jsa.mu.RLock()
   663  	js := jsa.js
   664  	jsa.mu.RUnlock()
   665  	if js == nil {
   666  		return nil, nil, nil
   667  	}
   668  	// Lock not needed, set on creation.
   669  	s := js.srv
   670  	return s, js, jsa
   671  }
   672  
   673  func (s *Server) JetStreamIsStreamLeader(account, stream string) bool {
   674  	js, cc := s.getJetStreamCluster()
   675  	if js == nil || cc == nil {
   676  		return false
   677  	}
   678  	js.mu.RLock()
   679  	defer js.mu.RUnlock()
   680  	return cc.isStreamLeader(account, stream)
   681  }
   682  
   683  func (a *Account) JetStreamIsStreamLeader(stream string) bool {
   684  	s, js, jsa := a.getJetStreamFromAccount()
   685  	if s == nil || js == nil || jsa == nil {
   686  		return false
   687  	}
   688  	js.mu.RLock()
   689  	defer js.mu.RUnlock()
   690  	return js.cluster.isStreamLeader(a.Name, stream)
   691  }
   692  
   693  func (s *Server) JetStreamIsStreamCurrent(account, stream string) bool {
   694  	js, cc := s.getJetStreamCluster()
   695  	if js == nil {
   696  		return false
   697  	}
   698  	js.mu.RLock()
   699  	defer js.mu.RUnlock()
   700  	return cc.isStreamCurrent(account, stream)
   701  }
   702  
   703  func (a *Account) JetStreamIsConsumerLeader(stream, consumer string) bool {
   704  	s, js, jsa := a.getJetStreamFromAccount()
   705  	if s == nil || js == nil || jsa == nil {
   706  		return false
   707  	}
   708  	js.mu.RLock()
   709  	defer js.mu.RUnlock()
   710  	return js.cluster.isConsumerLeader(a.Name, stream, consumer)
   711  }
   712  
   713  func (s *Server) JetStreamIsConsumerLeader(account, stream, consumer string) bool {
   714  	js, cc := s.getJetStreamCluster()
   715  	if js == nil || cc == nil {
   716  		return false
   717  	}
   718  	js.mu.RLock()
   719  	defer js.mu.RUnlock()
   720  	return cc.isConsumerLeader(account, stream, consumer)
   721  }
   722  
   723  func (s *Server) enableJetStreamClustering() error {
   724  	if !s.isRunning() {
   725  		return nil
   726  	}
   727  	js := s.getJetStream()
   728  	if js == nil {
   729  		return NewJSNotEnabledForAccountError()
   730  	}
   731  	// Already set.
   732  	if js.cluster != nil {
   733  		return nil
   734  	}
   735  
   736  	s.Noticef("Starting JetStream cluster")
   737  	// We need to determine if we have a stable cluster name and expected number of servers.
   738  	s.Debugf("JetStream cluster checking for stable cluster name and peers")
   739  
   740  	hasLeafNodeSystemShare := s.canExtendOtherDomain()
   741  	if s.isClusterNameDynamic() && !hasLeafNodeSystemShare {
   742  		return errors.New("JetStream cluster requires cluster name")
   743  	}
   744  	if s.configuredRoutes() == 0 && !hasLeafNodeSystemShare {
   745  		return errors.New("JetStream cluster requires configured routes or solicited leafnode for the system account")
   746  	}
   747  
   748  	return js.setupMetaGroup()
   749  }
   750  
   751  // isClustered returns if we are clustered.
   752  // Lock should not be held.
   753  func (js *jetStream) isClustered() bool {
   754  	// This is only ever set, no need for lock here.
   755  	return js.cluster != nil
   756  }
   757  
   758  // isClusteredNoLock returns if we are clustered, but unlike isClustered() does
   759  // not use the jetstream's lock, instead, uses an atomic operation.
   760  // There are situations where some code wants to know if we are clustered but
   761  // can't use js.isClustered() without causing a lock inversion.
   762  func (js *jetStream) isClusteredNoLock() bool {
   763  	return atomic.LoadInt32(&js.clustered) == 1
   764  }
   765  
   766  func (js *jetStream) setupMetaGroup() error {
   767  	s := js.srv
   768  	s.Noticef("Creating JetStream metadata controller")
   769  
   770  	// Setup our WAL for the metagroup.
   771  	sysAcc := s.SystemAccount()
   772  	storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, defaultMetaGroupName)
   773  
   774  	fs, err := newFileStoreWithCreated(
   775  		FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMetaFSBlkSize, AsyncFlush: false, srv: s},
   776  		StreamConfig{Name: defaultMetaGroupName, Storage: FileStorage},
   777  		time.Now().UTC(),
   778  		s.jsKeyGen(s.getOpts().JetStreamKey, defaultMetaGroupName),
   779  		s.jsKeyGen(s.getOpts().JetStreamOldKey, defaultMetaGroupName),
   780  	)
   781  	if err != nil {
   782  		s.Errorf("Error creating filestore: %v", err)
   783  		return err
   784  	}
   785  
   786  	cfg := &RaftConfig{Name: defaultMetaGroupName, Store: storeDir, Log: fs}
   787  
   788  	// If we are soliciting leafnode connections and we are sharing a system account and do not disable it with a hint,
   789  	// we want to move to observer mode so that we extend the solicited cluster or supercluster but do not form our own.
   790  	cfg.Observer = s.canExtendOtherDomain() && s.getOpts().JetStreamExtHint != jsNoExtend
   791  
   792  	var bootstrap bool
   793  	if ps, err := readPeerState(storeDir); err != nil {
   794  		s.Noticef("JetStream cluster bootstrapping")
   795  		bootstrap = true
   796  		peers := s.ActivePeers()
   797  		s.Debugf("JetStream cluster initial peers: %+v", peers)
   798  		if err := s.bootstrapRaftNode(cfg, peers, false); err != nil {
   799  			return err
   800  		}
   801  		if cfg.Observer {
   802  			s.Noticef("Turning JetStream metadata controller Observer Mode on")
   803  		}
   804  	} else {
   805  		s.Noticef("JetStream cluster recovering state")
   806  		// correlate the value of observer with observations from a previous run.
   807  		if cfg.Observer {
   808  			switch ps.domainExt {
   809  			case extExtended:
   810  				s.Noticef("Keeping JetStream metadata controller Observer Mode on - due to previous contact")
   811  			case extNotExtended:
   812  				s.Noticef("Turning JetStream metadata controller Observer Mode off - due to previous contact")
   813  				cfg.Observer = false
   814  			case extUndetermined:
   815  				s.Noticef("Turning JetStream metadata controller Observer Mode on - no previous contact")
   816  				s.Noticef("In cases where JetStream will not be extended")
   817  				s.Noticef("and waiting for leader election until first contact is not acceptable,")
   818  				s.Noticef(`manually disable Observer Mode by setting the JetStream Option "extension_hint: %s"`, jsNoExtend)
   819  			}
   820  		} else {
   821  			// To track possible configuration changes, responsible for an altered value of cfg.Observer,
   822  			// set extension state to undetermined.
   823  			ps.domainExt = extUndetermined
   824  			if err := writePeerState(storeDir, ps); err != nil {
   825  				return err
   826  			}
   827  		}
   828  	}
   829  
   830  	// Start up our meta node.
   831  	n, err := s.startRaftNode(sysAcc.GetName(), cfg, pprofLabels{
   832  		"type":    "metaleader",
   833  		"account": sysAcc.Name,
   834  	})
   835  	if err != nil {
   836  		s.Warnf("Could not start metadata controller: %v", err)
   837  		return err
   838  	}
   839  
   840  	// If we are bootstrapped with no state, start campaign early.
   841  	if bootstrap {
   842  		n.Campaign()
   843  	}
   844  
   845  	c := s.createInternalJetStreamClient()
   846  	sacc := s.SystemAccount()
   847  
   848  	js.mu.Lock()
   849  	defer js.mu.Unlock()
   850  	js.cluster = &jetStreamCluster{
   851  		meta:    n,
   852  		streams: make(map[string]map[string]*streamAssignment),
   853  		s:       s,
   854  		c:       c,
   855  		qch:     make(chan struct{}),
   856  	}
   857  	atomic.StoreInt32(&js.clustered, 1)
   858  	c.registerWithAccount(sacc)
   859  
   860  	js.srv.startGoRoutine(
   861  		js.monitorCluster,
   862  		pprofLabels{
   863  			"type":    "metaleader",
   864  			"account": sacc.Name,
   865  		},
   866  	)
   867  	return nil
   868  }
   869  
   870  func (js *jetStream) getMetaGroup() RaftNode {
   871  	js.mu.RLock()
   872  	defer js.mu.RUnlock()
   873  	if js.cluster == nil {
   874  		return nil
   875  	}
   876  	return js.cluster.meta
   877  }
   878  
   879  func (js *jetStream) server() *Server {
   880  	// Lock not needed, only set once on creation.
   881  	return js.srv
   882  }
   883  
   884  // Will respond if we do not think we have a metacontroller leader.
   885  func (js *jetStream) isLeaderless() bool {
   886  	js.mu.RLock()
   887  	defer js.mu.RUnlock()
   888  
   889  	cc := js.cluster
   890  	if cc == nil || cc.meta == nil {
   891  		return false
   892  	}
   893  	// If we don't have a leader.
   894  	// Make sure we have been running for enough time.
   895  	if cc.meta.GroupLeader() == _EMPTY_ && time.Since(cc.meta.Created()) > lostQuorumIntervalDefault {
   896  		return true
   897  	}
   898  	return false
   899  }
   900  
   901  // Will respond iff we are a member and we know we have no leader.
   902  func (js *jetStream) isGroupLeaderless(rg *raftGroup) bool {
   903  	if rg == nil || js == nil {
   904  		return false
   905  	}
   906  	js.mu.RLock()
   907  	defer js.mu.RUnlock()
   908  
   909  	cc := js.cluster
   910  
   911  	// If we are not a member we can not say..
   912  	if cc.meta == nil {
   913  		return false
   914  	}
   915  	if !rg.isMember(cc.meta.ID()) {
   916  		return false
   917  	}
   918  	// Single peer groups always have a leader if we are here.
   919  	if rg.node == nil {
   920  		return false
   921  	}
   922  	// If we don't have a leader.
   923  	if rg.node.GroupLeader() == _EMPTY_ {
   924  		// Threshold for jetstream startup.
   925  		const startupThreshold = 10 * time.Second
   926  
   927  		if rg.node.HadPreviousLeader() {
   928  			// Make sure we have been running long enough to intelligently determine this.
   929  			if time.Since(js.started) > startupThreshold {
   930  				return true
   931  			}
   932  		}
   933  		// Make sure we have been running for enough time.
   934  		if time.Since(rg.node.Created()) > lostQuorumIntervalDefault {
   935  			return true
   936  		}
   937  	}
   938  
   939  	return false
   940  }
   941  
   942  func (s *Server) JetStreamIsStreamAssigned(account, stream string) bool {
   943  	js, cc := s.getJetStreamCluster()
   944  	if js == nil || cc == nil {
   945  		return false
   946  	}
   947  	acc, _ := s.LookupAccount(account)
   948  	if acc == nil {
   949  		return false
   950  	}
   951  	js.mu.RLock()
   952  	assigned := cc.isStreamAssigned(acc, stream)
   953  	js.mu.RUnlock()
   954  	return assigned
   955  }
   956  
   957  // streamAssigned informs us if this server has this stream assigned.
   958  func (jsa *jsAccount) streamAssigned(stream string) bool {
   959  	jsa.mu.RLock()
   960  	js, acc := jsa.js, jsa.account
   961  	jsa.mu.RUnlock()
   962  
   963  	if js == nil {
   964  		return false
   965  	}
   966  	js.mu.RLock()
   967  	assigned := js.cluster.isStreamAssigned(acc, stream)
   968  	js.mu.RUnlock()
   969  	return assigned
   970  }
   971  
   972  // Read lock should be held.
   973  func (cc *jetStreamCluster) isStreamAssigned(a *Account, stream string) bool {
   974  	// Non-clustered mode always return true.
   975  	if cc == nil {
   976  		return true
   977  	}
   978  	if cc.meta == nil {
   979  		return false
   980  	}
   981  	as := cc.streams[a.Name]
   982  	if as == nil {
   983  		return false
   984  	}
   985  	sa := as[stream]
   986  	if sa == nil {
   987  		return false
   988  	}
   989  	rg := sa.Group
   990  	if rg == nil {
   991  		return false
   992  	}
   993  	// Check if we are the leader of this raftGroup assigned to the stream.
   994  	ourID := cc.meta.ID()
   995  	for _, peer := range rg.Peers {
   996  		if peer == ourID {
   997  			return true
   998  		}
   999  	}
  1000  	return false
  1001  }
  1002  
  1003  // Read lock should be held.
  1004  func (cc *jetStreamCluster) isStreamLeader(account, stream string) bool {
  1005  	// Non-clustered mode always return true.
  1006  	if cc == nil {
  1007  		return true
  1008  	}
  1009  	if cc.meta == nil {
  1010  		return false
  1011  	}
  1012  
  1013  	var sa *streamAssignment
  1014  	if as := cc.streams[account]; as != nil {
  1015  		sa = as[stream]
  1016  	}
  1017  	if sa == nil {
  1018  		return false
  1019  	}
  1020  	rg := sa.Group
  1021  	if rg == nil {
  1022  		return false
  1023  	}
  1024  	// Check if we are the leader of this raftGroup assigned to the stream.
  1025  	ourID := cc.meta.ID()
  1026  	for _, peer := range rg.Peers {
  1027  		if peer == ourID {
  1028  			if len(rg.Peers) == 1 || rg.node != nil && rg.node.Leader() {
  1029  				return true
  1030  			}
  1031  		}
  1032  	}
  1033  	return false
  1034  }
  1035  
  1036  // Read lock should be held.
  1037  func (cc *jetStreamCluster) isConsumerLeader(account, stream, consumer string) bool {
  1038  	// Non-clustered mode always return true.
  1039  	if cc == nil {
  1040  		return true
  1041  	}
  1042  	if cc.meta == nil {
  1043  		return false
  1044  	}
  1045  
  1046  	var sa *streamAssignment
  1047  	if as := cc.streams[account]; as != nil {
  1048  		sa = as[stream]
  1049  	}
  1050  	if sa == nil {
  1051  		return false
  1052  	}
  1053  	// Check if we are the leader of this raftGroup assigned to this consumer.
  1054  	ca := sa.consumers[consumer]
  1055  	if ca == nil {
  1056  		return false
  1057  	}
  1058  	rg := ca.Group
  1059  	ourID := cc.meta.ID()
  1060  	for _, peer := range rg.Peers {
  1061  		if peer == ourID {
  1062  			if len(rg.Peers) == 1 || (rg.node != nil && rg.node.Leader()) {
  1063  				return true
  1064  			}
  1065  		}
  1066  	}
  1067  	return false
  1068  }
  1069  
  1070  // Remove the stream `streamName` for the account `accName` from the inflight
  1071  // proposals map. This is done on success (processStreamAssignment) or on
  1072  // failure (processStreamAssignmentResults).
  1073  // (Write) Lock held on entry.
  1074  func (cc *jetStreamCluster) removeInflightProposal(accName, streamName string) {
  1075  	streams, ok := cc.inflight[accName]
  1076  	if !ok {
  1077  		return
  1078  	}
  1079  	delete(streams, streamName)
  1080  	if len(streams) == 0 {
  1081  		delete(cc.inflight, accName)
  1082  	}
  1083  }
  1084  
  1085  // Return the cluster quit chan.
  1086  func (js *jetStream) clusterQuitC() chan struct{} {
  1087  	js.mu.RLock()
  1088  	defer js.mu.RUnlock()
  1089  	if js.cluster != nil {
  1090  		return js.cluster.qch
  1091  	}
  1092  	return nil
  1093  }
  1094  
  1095  // Mark that the meta layer is recovering.
  1096  func (js *jetStream) setMetaRecovering() {
  1097  	js.mu.Lock()
  1098  	defer js.mu.Unlock()
  1099  	if js.cluster != nil {
  1100  		// metaRecovering
  1101  		js.metaRecovering = true
  1102  	}
  1103  }
  1104  
  1105  // Mark that the meta layer is no longer recovering.
  1106  func (js *jetStream) clearMetaRecovering() {
  1107  	js.mu.Lock()
  1108  	defer js.mu.Unlock()
  1109  	js.metaRecovering = false
  1110  }
  1111  
  1112  // Return whether the meta layer is recovering.
  1113  func (js *jetStream) isMetaRecovering() bool {
  1114  	js.mu.RLock()
  1115  	defer js.mu.RUnlock()
  1116  	return js.metaRecovering
  1117  }
  1118  
  1119  // During recovery track any stream and consumer delete and update operations.
  1120  type recoveryUpdates struct {
  1121  	removeStreams   map[string]*streamAssignment
  1122  	removeConsumers map[string]*consumerAssignment
  1123  	updateStreams   map[string]*streamAssignment
  1124  	updateConsumers map[string]*consumerAssignment
  1125  }
  1126  
  1127  // Called after recovery of the cluster on startup to check for any orphans.
  1128  // Streams and consumers are recovered from disk, and the meta layer's mappings
  1129  // should clean them up, but under crash scenarios there could be orphans.
  1130  func (js *jetStream) checkForOrphans() {
  1131  	consumerName := func(o *consumer) string {
  1132  		o.mu.RLock()
  1133  		defer o.mu.RUnlock()
  1134  		return o.name
  1135  	}
  1136  
  1137  	// Can not hold jetstream lock while trying to delete streams or consumers.
  1138  	js.mu.Lock()
  1139  	s, cc := js.srv, js.cluster
  1140  	s.Debugf("JetStream cluster checking for orphans")
  1141  
  1142  	var streams []*stream
  1143  	var consumers []*consumer
  1144  
  1145  	for accName, jsa := range js.accounts {
  1146  		asa := cc.streams[accName]
  1147  		jsa.mu.RLock()
  1148  		for stream, mset := range jsa.streams {
  1149  			if sa := asa[stream]; sa == nil {
  1150  				streams = append(streams, mset)
  1151  			} else {
  1152  				// This one is good, check consumers now.
  1153  				for _, o := range mset.getConsumers() {
  1154  					consumer := consumerName(o)
  1155  					if sa.consumers[consumer] == nil {
  1156  						consumers = append(consumers, o)
  1157  					}
  1158  				}
  1159  			}
  1160  		}
  1161  		jsa.mu.RUnlock()
  1162  	}
  1163  	js.mu.Unlock()
  1164  
  1165  	for _, mset := range streams {
  1166  		mset.mu.RLock()
  1167  		accName, stream := mset.acc.Name, mset.cfg.Name
  1168  		mset.mu.RUnlock()
  1169  		s.Warnf("Detected orphaned stream '%s > %s', will cleanup", accName, stream)
  1170  		if err := mset.delete(); err != nil {
  1171  			s.Warnf("Deleting stream encountered an error: %v", err)
  1172  		}
  1173  	}
  1174  	for _, o := range consumers {
  1175  		o.mu.RLock()
  1176  		accName, mset, consumer := o.acc.Name, o.mset, o.name
  1177  		o.mu.RUnlock()
  1178  		stream := "N/A"
  1179  		if mset != nil {
  1180  			mset.mu.RLock()
  1181  			stream = mset.cfg.Name
  1182  			mset.mu.RUnlock()
  1183  		}
  1184  		s.Warnf("Detected orphaned consumer '%s > %s > %s', will cleanup", accName, stream, consumer)
  1185  		if err := o.delete(); err != nil {
  1186  			s.Warnf("Deleting consumer encountered an error: %v", err)
  1187  		}
  1188  	}
  1189  }
  1190  
  1191  // Check and delete any orphans we may come across.
  1192  func (s *Server) checkForNRGOrphans() {
  1193  	js, cc := s.getJetStreamCluster()
  1194  	if js == nil || cc == nil || js.isMetaRecovering() {
  1195  		// No cluster means no NRGs. Also return if still recovering.
  1196  		return
  1197  	}
  1198  
  1199  	// Track which assets R>1 should be on this server.
  1200  	nrgMap := make(map[string]struct{})
  1201  	trackGroup := func(rg *raftGroup) {
  1202  		// If R>1 track this as a legit NRG.
  1203  		if rg.node != nil {
  1204  			nrgMap[rg.Name] = struct{}{}
  1205  		}
  1206  	}
  1207  	// Register our meta.
  1208  	js.mu.RLock()
  1209  	meta := cc.meta
  1210  	if meta == nil {
  1211  		js.mu.RUnlock()
  1212  		// Bail with no meta node.
  1213  		return
  1214  	}
  1215  
  1216  	ourID := meta.ID()
  1217  	nrgMap[meta.Group()] = struct{}{}
  1218  
  1219  	// Collect all valid groups from our assignments.
  1220  	for _, asa := range cc.streams {
  1221  		for _, sa := range asa {
  1222  			if sa.Group.isMember(ourID) && sa.Restore == nil {
  1223  				trackGroup(sa.Group)
  1224  				for _, ca := range sa.consumers {
  1225  					if ca.Group.isMember(ourID) {
  1226  						trackGroup(ca.Group)
  1227  					}
  1228  				}
  1229  			}
  1230  		}
  1231  	}
  1232  	js.mu.RUnlock()
  1233  
  1234  	// Check NRGs that are running.
  1235  	var needDelete []RaftNode
  1236  	s.rnMu.RLock()
  1237  	for name, n := range s.raftNodes {
  1238  		if _, ok := nrgMap[name]; !ok {
  1239  			needDelete = append(needDelete, n)
  1240  		}
  1241  	}
  1242  	s.rnMu.RUnlock()
  1243  
  1244  	for _, n := range needDelete {
  1245  		s.Warnf("Detected orphaned NRG %q, will cleanup", n.Group())
  1246  		n.Delete()
  1247  	}
  1248  }
  1249  
  1250  func (js *jetStream) monitorCluster() {
  1251  	s, n := js.server(), js.getMetaGroup()
  1252  	qch, rqch, lch, aq := js.clusterQuitC(), n.QuitC(), n.LeadChangeC(), n.ApplyQ()
  1253  
  1254  	defer s.grWG.Done()
  1255  
  1256  	s.Debugf("Starting metadata monitor")
  1257  	defer s.Debugf("Exiting metadata monitor")
  1258  
  1259  	// Make sure to stop the raft group on exit to prevent accidental memory bloat.
  1260  	defer n.Stop()
  1261  	defer s.isMetaLeader.Store(false)
  1262  
  1263  	const compactInterval = time.Minute
  1264  	t := time.NewTicker(compactInterval)
  1265  	defer t.Stop()
  1266  
  1267  	// Used to check cold boot cluster when possibly in mixed mode.
  1268  	const leaderCheckInterval = time.Second
  1269  	lt := time.NewTicker(leaderCheckInterval)
  1270  	defer lt.Stop()
  1271  
  1272  	// Check the general health once an hour.
  1273  	const healthCheckInterval = 1 * time.Hour
  1274  	ht := time.NewTicker(healthCheckInterval)
  1275  	defer ht.Stop()
  1276  
  1277  	// Utility to check health.
  1278  	checkHealth := func() {
  1279  		if hs := s.healthz(nil); hs.Error != _EMPTY_ {
  1280  			s.Warnf("%v", hs.Error)
  1281  		}
  1282  		// Also check for orphaned NRGs.
  1283  		s.checkForNRGOrphans()
  1284  	}
  1285  
  1286  	var (
  1287  		isLeader       bool
  1288  		lastSnapTime   time.Time
  1289  		compactSizeMin = uint64(8 * 1024 * 1024) // 8MB
  1290  		minSnapDelta   = 10 * time.Second
  1291  	)
  1292  
  1293  	// Highwayhash key for generating hashes.
  1294  	key := make([]byte, 32)
  1295  	crand.Read(key)
  1296  
  1297  	// Set to true to start.
  1298  	js.setMetaRecovering()
  1299  
  1300  	// Snapshotting function.
  1301  	doSnapshot := func() {
  1302  		// Suppress during recovery.
  1303  		if js.isMetaRecovering() {
  1304  			return
  1305  		}
  1306  		// For the meta layer we want to snapshot when asked if we need one or have any entries that we can compact.
  1307  		if ne, _ := n.Size(); ne > 0 || n.NeedSnapshot() {
  1308  			if err := n.InstallSnapshot(js.metaSnapshot()); err == nil {
  1309  				lastSnapTime = time.Now()
  1310  			} else if err != errNoSnapAvailable && err != errNodeClosed {
  1311  				s.Warnf("Error snapshotting JetStream cluster state: %v", err)
  1312  			}
  1313  		}
  1314  	}
  1315  
  1316  	ru := &recoveryUpdates{
  1317  		removeStreams:   make(map[string]*streamAssignment),
  1318  		removeConsumers: make(map[string]*consumerAssignment),
  1319  		updateStreams:   make(map[string]*streamAssignment),
  1320  		updateConsumers: make(map[string]*consumerAssignment),
  1321  	}
  1322  
  1323  	for {
  1324  		select {
  1325  		case <-s.quitCh:
  1326  			return
  1327  		case <-rqch:
  1328  			return
  1329  		case <-qch:
  1330  			// Clean signal from shutdown routine so do best effort attempt to snapshot meta layer.
  1331  			doSnapshot()
  1332  			// Return the signal back since shutdown will be waiting.
  1333  			close(qch)
  1334  			return
  1335  		case <-aq.ch:
  1336  			ces := aq.pop()
  1337  			for _, ce := range ces {
  1338  				if ce == nil {
  1339  					// Signals we have replayed all of our metadata.
  1340  					js.clearMetaRecovering()
  1341  					// Process any removes that are still valid after recovery.
  1342  					for _, ca := range ru.removeConsumers {
  1343  						js.processConsumerRemoval(ca)
  1344  					}
  1345  					for _, sa := range ru.removeStreams {
  1346  						js.processStreamRemoval(sa)
  1347  					}
  1348  					// Process pending updates.
  1349  					for _, sa := range ru.updateStreams {
  1350  						js.processUpdateStreamAssignment(sa)
  1351  					}
  1352  					// Now consumers.
  1353  					for _, ca := range ru.updateConsumers {
  1354  						js.processConsumerAssignment(ca)
  1355  					}
  1356  					// Clear.
  1357  					ru = nil
  1358  					s.Debugf("Recovered JetStream cluster metadata")
  1359  					js.checkForOrphans()
  1360  					// Do a health check here as well.
  1361  					go checkHealth()
  1362  					continue
  1363  				}
  1364  				if didSnap, didStreamRemoval, didConsumerRemoval, err := js.applyMetaEntries(ce.Entries, ru); err == nil {
  1365  					_, nb := n.Applied(ce.Index)
  1366  					if js.hasPeerEntries(ce.Entries) || didStreamRemoval || (didSnap && !isLeader) {
  1367  						doSnapshot()
  1368  					} else if didConsumerRemoval && time.Since(lastSnapTime) > minSnapDelta/2 {
  1369  						doSnapshot()
  1370  					} else if nb > compactSizeMin && time.Since(lastSnapTime) > minSnapDelta {
  1371  						doSnapshot()
  1372  					}
  1373  					ce.ReturnToPool()
  1374  				} else {
  1375  					s.Warnf("Error applying JetStream cluster entries: %v", err)
  1376  				}
  1377  			}
  1378  			aq.recycle(&ces)
  1379  
  1380  		case isLeader = <-lch:
  1381  			// For meta layer synchronize everyone to our state on becoming leader.
  1382  			if isLeader && n.ApplyQ().len() == 0 {
  1383  				n.SendSnapshot(js.metaSnapshot())
  1384  			}
  1385  			// Process the change.
  1386  			js.processLeaderChange(isLeader)
  1387  			if isLeader {
  1388  				s.sendInternalMsgLocked(serverStatsPingReqSubj, _EMPTY_, nil, nil)
  1389  				// Install a snapshot as we become leader.
  1390  				js.checkClusterSize()
  1391  				doSnapshot()
  1392  			}
  1393  
  1394  		case <-t.C:
  1395  			doSnapshot()
  1396  			// Periodically check the cluster size.
  1397  			if n.Leader() {
  1398  				js.checkClusterSize()
  1399  			}
  1400  		case <-ht.C:
  1401  			// Do this in a separate go routine.
  1402  			go checkHealth()
  1403  
  1404  		case <-lt.C:
  1405  			s.Debugf("Checking JetStream cluster state")
  1406  			// If we have a current leader or had one in the past we can cancel this here since the metaleader
  1407  			// will be in charge of all peer state changes.
  1408  			// For cold boot only.
  1409  			if n.GroupLeader() != _EMPTY_ || n.HadPreviousLeader() {
  1410  				lt.Stop()
  1411  				continue
  1412  			}
  1413  			// If we are here we do not have a leader and we did not have a previous one, so cold start.
  1414  			// Check to see if we can adjust our cluster size down iff we are in mixed mode and we have
  1415  			// seen a total that is what our original estimate was.
  1416  			cs := n.ClusterSize()
  1417  			if js, total := s.trackedJetStreamServers(); js < total && total >= cs && js != cs {
  1418  				s.Noticef("Adjusting JetStream expected peer set size to %d from original %d", js, cs)
  1419  				n.AdjustBootClusterSize(js)
  1420  			}
  1421  		}
  1422  	}
  1423  }
  1424  
  1425  // This is called on first leader transition to double check the peers and cluster set size.
  1426  func (js *jetStream) checkClusterSize() {
  1427  	s, n := js.server(), js.getMetaGroup()
  1428  	if n == nil {
  1429  		return
  1430  	}
  1431  	// We will check that we have a correct cluster set size by checking for any non-js servers
  1432  	// which can happen in mixed mode.
  1433  	ps := n.(*raft).currentPeerState()
  1434  	if len(ps.knownPeers) >= ps.clusterSize {
  1435  		return
  1436  	}
  1437  
  1438  	// Grab our active peers.
  1439  	peers := s.ActivePeers()
  1440  
  1441  	// If we have not registered all of our peers yet we can't do
  1442  	// any adjustments based on a mixed mode. We will periodically check back.
  1443  	if len(peers) < ps.clusterSize {
  1444  		return
  1445  	}
  1446  
  1447  	s.Debugf("Checking JetStream cluster size")
  1448  
  1449  	// If we are here our known set as the leader is not the same as the cluster size.
  1450  	// Check to see if we have a mixed mode setup.
  1451  	var totalJS int
  1452  	for _, p := range peers {
  1453  		if si, ok := s.nodeToInfo.Load(p); ok && si != nil {
  1454  			if si.(nodeInfo).js {
  1455  				totalJS++
  1456  			}
  1457  		}
  1458  	}
  1459  	// If we have less then our cluster size adjust that here. Can not do individual peer removals since
  1460  	// they will not be in the tracked peers.
  1461  	if totalJS < ps.clusterSize {
  1462  		s.Debugf("Adjusting JetStream cluster size from %d to %d", ps.clusterSize, totalJS)
  1463  		if err := n.AdjustClusterSize(totalJS); err != nil {
  1464  			s.Warnf("Error adjusting JetStream cluster size: %v", err)
  1465  		}
  1466  	}
  1467  }
  1468  
  1469  // Represents our stable meta state that we can write out.
  1470  type writeableStreamAssignment struct {
  1471  	Client    *ClientInfo   `json:"client,omitempty"`
  1472  	Created   time.Time     `json:"created"`
  1473  	Config    *StreamConfig `json:"stream"`
  1474  	Group     *raftGroup    `json:"group"`
  1475  	Sync      string        `json:"sync"`
  1476  	Consumers []*consumerAssignment
  1477  }
  1478  
  1479  func (js *jetStream) clusterStreamConfig(accName, streamName string) (StreamConfig, bool) {
  1480  	js.mu.RLock()
  1481  	defer js.mu.RUnlock()
  1482  	if sa, ok := js.cluster.streams[accName][streamName]; ok {
  1483  		return *sa.Config, true
  1484  	}
  1485  	return StreamConfig{}, false
  1486  }
  1487  
  1488  func (js *jetStream) metaSnapshot() []byte {
  1489  	js.mu.RLock()
  1490  	cc := js.cluster
  1491  	nsa := 0
  1492  	for _, asa := range cc.streams {
  1493  		nsa += len(asa)
  1494  	}
  1495  	streams := make([]writeableStreamAssignment, 0, nsa)
  1496  	for _, asa := range cc.streams {
  1497  		for _, sa := range asa {
  1498  			wsa := writeableStreamAssignment{
  1499  				Client:    sa.Client,
  1500  				Created:   sa.Created,
  1501  				Config:    sa.Config,
  1502  				Group:     sa.Group,
  1503  				Sync:      sa.Sync,
  1504  				Consumers: make([]*consumerAssignment, 0, len(sa.consumers)),
  1505  			}
  1506  			for _, ca := range sa.consumers {
  1507  				wsa.Consumers = append(wsa.Consumers, ca)
  1508  			}
  1509  			streams = append(streams, wsa)
  1510  		}
  1511  	}
  1512  
  1513  	if len(streams) == 0 {
  1514  		js.mu.RUnlock()
  1515  		return nil
  1516  	}
  1517  
  1518  	b, _ := json.Marshal(streams)
  1519  	js.mu.RUnlock()
  1520  
  1521  	return s2.EncodeBetter(nil, b)
  1522  }
  1523  
  1524  func (js *jetStream) applyMetaSnapshot(buf []byte, ru *recoveryUpdates, isRecovering bool) error {
  1525  	var wsas []writeableStreamAssignment
  1526  	if len(buf) > 0 {
  1527  		jse, err := s2.Decode(nil, buf)
  1528  		if err != nil {
  1529  			return err
  1530  		}
  1531  		if err = json.Unmarshal(jse, &wsas); err != nil {
  1532  			return err
  1533  		}
  1534  	}
  1535  
  1536  	// Build our new version here outside of js.
  1537  	streams := make(map[string]map[string]*streamAssignment)
  1538  	for _, wsa := range wsas {
  1539  		fixCfgMirrorWithDedupWindow(wsa.Config)
  1540  		as := streams[wsa.Client.serviceAccount()]
  1541  		if as == nil {
  1542  			as = make(map[string]*streamAssignment)
  1543  			streams[wsa.Client.serviceAccount()] = as
  1544  		}
  1545  		sa := &streamAssignment{Client: wsa.Client, Created: wsa.Created, Config: wsa.Config, Group: wsa.Group, Sync: wsa.Sync}
  1546  		if len(wsa.Consumers) > 0 {
  1547  			sa.consumers = make(map[string]*consumerAssignment)
  1548  			for _, ca := range wsa.Consumers {
  1549  				sa.consumers[ca.Name] = ca
  1550  			}
  1551  		}
  1552  		as[wsa.Config.Name] = sa
  1553  	}
  1554  
  1555  	js.mu.Lock()
  1556  	cc := js.cluster
  1557  
  1558  	var saAdd, saDel, saChk []*streamAssignment
  1559  	// Walk through the old list to generate the delete list.
  1560  	for account, asa := range cc.streams {
  1561  		nasa := streams[account]
  1562  		for sn, sa := range asa {
  1563  			if nsa := nasa[sn]; nsa == nil {
  1564  				saDel = append(saDel, sa)
  1565  			} else {
  1566  				saChk = append(saChk, nsa)
  1567  			}
  1568  		}
  1569  	}
  1570  	// Walk through the new list to generate the add list.
  1571  	for account, nasa := range streams {
  1572  		asa := cc.streams[account]
  1573  		for sn, sa := range nasa {
  1574  			if asa[sn] == nil {
  1575  				saAdd = append(saAdd, sa)
  1576  			}
  1577  		}
  1578  	}
  1579  	// Now walk the ones to check and process consumers.
  1580  	var caAdd, caDel []*consumerAssignment
  1581  	for _, sa := range saChk {
  1582  		// Make sure to add in all the new ones from sa.
  1583  		for _, ca := range sa.consumers {
  1584  			caAdd = append(caAdd, ca)
  1585  		}
  1586  		if osa := js.streamAssignment(sa.Client.serviceAccount(), sa.Config.Name); osa != nil {
  1587  			for _, ca := range osa.consumers {
  1588  				if sa.consumers[ca.Name] == nil {
  1589  					caDel = append(caDel, ca)
  1590  				} else {
  1591  					caAdd = append(caAdd, ca)
  1592  				}
  1593  			}
  1594  		}
  1595  	}
  1596  	js.mu.Unlock()
  1597  
  1598  	// Do removals first.
  1599  	for _, sa := range saDel {
  1600  		js.setStreamAssignmentRecovering(sa)
  1601  		if isRecovering {
  1602  			key := sa.recoveryKey()
  1603  			ru.removeStreams[key] = sa
  1604  			delete(ru.updateStreams, key)
  1605  		} else {
  1606  			js.processStreamRemoval(sa)
  1607  		}
  1608  	}
  1609  	// Now do add for the streams. Also add in all consumers.
  1610  	for _, sa := range saAdd {
  1611  		js.setStreamAssignmentRecovering(sa)
  1612  		js.processStreamAssignment(sa)
  1613  
  1614  		// We can simply process the consumers.
  1615  		for _, ca := range sa.consumers {
  1616  			js.setConsumerAssignmentRecovering(ca)
  1617  			js.processConsumerAssignment(ca)
  1618  		}
  1619  	}
  1620  
  1621  	// Perform updates on those in saChk. These were existing so make
  1622  	// sure to process any changes.
  1623  	for _, sa := range saChk {
  1624  		js.setStreamAssignmentRecovering(sa)
  1625  		if isRecovering {
  1626  			key := sa.recoveryKey()
  1627  			ru.updateStreams[key] = sa
  1628  			delete(ru.removeStreams, key)
  1629  		} else {
  1630  			js.processUpdateStreamAssignment(sa)
  1631  		}
  1632  	}
  1633  
  1634  	// Now do the deltas for existing stream's consumers.
  1635  	for _, ca := range caDel {
  1636  		js.setConsumerAssignmentRecovering(ca)
  1637  		if isRecovering {
  1638  			key := ca.recoveryKey()
  1639  			ru.removeConsumers[key] = ca
  1640  			delete(ru.updateConsumers, key)
  1641  		} else {
  1642  			js.processConsumerRemoval(ca)
  1643  		}
  1644  	}
  1645  	for _, ca := range caAdd {
  1646  		js.setConsumerAssignmentRecovering(ca)
  1647  		if isRecovering {
  1648  			key := ca.recoveryKey()
  1649  			delete(ru.removeConsumers, key)
  1650  			ru.updateConsumers[key] = ca
  1651  		} else {
  1652  			js.processConsumerAssignment(ca)
  1653  		}
  1654  	}
  1655  
  1656  	return nil
  1657  }
  1658  
  1659  // Called on recovery to make sure we do not process like original.
  1660  func (js *jetStream) setStreamAssignmentRecovering(sa *streamAssignment) {
  1661  	js.mu.Lock()
  1662  	defer js.mu.Unlock()
  1663  	sa.responded = true
  1664  	sa.recovering = true
  1665  	sa.Restore = nil
  1666  	if sa.Group != nil {
  1667  		sa.Group.Preferred = _EMPTY_
  1668  	}
  1669  }
  1670  
  1671  // Called on recovery to make sure we do not process like original.
  1672  func (js *jetStream) setConsumerAssignmentRecovering(ca *consumerAssignment) {
  1673  	js.mu.Lock()
  1674  	defer js.mu.Unlock()
  1675  	ca.responded = true
  1676  	ca.recovering = true
  1677  	if ca.Group != nil {
  1678  		ca.Group.Preferred = _EMPTY_
  1679  	}
  1680  }
  1681  
  1682  // Just copies over and changes out the group so it can be encoded.
  1683  // Lock should be held.
  1684  func (sa *streamAssignment) copyGroup() *streamAssignment {
  1685  	csa, cg := *sa, *sa.Group
  1686  	csa.Group = &cg
  1687  	csa.Group.Peers = copyStrings(sa.Group.Peers)
  1688  	return &csa
  1689  }
  1690  
  1691  // Just copies over and changes out the group so it can be encoded.
  1692  // Lock should be held.
  1693  func (ca *consumerAssignment) copyGroup() *consumerAssignment {
  1694  	cca, cg := *ca, *ca.Group
  1695  	cca.Group = &cg
  1696  	cca.Group.Peers = copyStrings(ca.Group.Peers)
  1697  	return &cca
  1698  }
  1699  
  1700  // Lock should be held.
  1701  func (sa *streamAssignment) missingPeers() bool {
  1702  	return len(sa.Group.Peers) < sa.Config.Replicas
  1703  }
  1704  
  1705  // Called when we detect a new peer. Only the leader will process checking
  1706  // for any streams, and consequently any consumers.
  1707  func (js *jetStream) processAddPeer(peer string) {
  1708  	js.mu.Lock()
  1709  	defer js.mu.Unlock()
  1710  
  1711  	s, cc := js.srv, js.cluster
  1712  	if cc == nil || cc.meta == nil {
  1713  		return
  1714  	}
  1715  	isLeader := cc.isLeader()
  1716  
  1717  	// Now check if we are meta-leader. We will check for any re-assignments.
  1718  	if !isLeader {
  1719  		return
  1720  	}
  1721  
  1722  	sir, ok := s.nodeToInfo.Load(peer)
  1723  	if !ok || sir == nil {
  1724  		return
  1725  	}
  1726  	si := sir.(nodeInfo)
  1727  
  1728  	for _, asa := range cc.streams {
  1729  		for _, sa := range asa {
  1730  			if sa.missingPeers() {
  1731  				// Make sure the right cluster etc.
  1732  				if si.cluster != sa.Client.Cluster {
  1733  					continue
  1734  				}
  1735  				// If we are here we can add in this peer.
  1736  				csa := sa.copyGroup()
  1737  				csa.Group.Peers = append(csa.Group.Peers, peer)
  1738  				// Send our proposal for this csa. Also use same group definition for all the consumers as well.
  1739  				cc.meta.Propose(encodeAddStreamAssignment(csa))
  1740  				for _, ca := range sa.consumers {
  1741  					// Ephemerals are R=1, so only auto-remap durables, or R>1.
  1742  					if ca.Config.Durable != _EMPTY_ || len(ca.Group.Peers) > 1 {
  1743  						cca := ca.copyGroup()
  1744  						cca.Group.Peers = csa.Group.Peers
  1745  						cc.meta.Propose(encodeAddConsumerAssignment(cca))
  1746  					}
  1747  				}
  1748  			}
  1749  		}
  1750  	}
  1751  }
  1752  
  1753  func (js *jetStream) processRemovePeer(peer string) {
  1754  	// We may be already disabled.
  1755  	if js == nil || js.disabled.Load() {
  1756  		return
  1757  	}
  1758  
  1759  	js.mu.Lock()
  1760  	s, cc := js.srv, js.cluster
  1761  	if cc == nil || cc.meta == nil {
  1762  		js.mu.Unlock()
  1763  		return
  1764  	}
  1765  	isLeader := cc.isLeader()
  1766  	// All nodes will check if this is them.
  1767  	isUs := cc.meta.ID() == peer
  1768  	js.mu.Unlock()
  1769  
  1770  	if isUs {
  1771  		s.Errorf("JetStream being DISABLED, our server was removed from the cluster")
  1772  		adv := &JSServerRemovedAdvisory{
  1773  			TypedEvent: TypedEvent{
  1774  				Type: JSServerRemovedAdvisoryType,
  1775  				ID:   nuid.Next(),
  1776  				Time: time.Now().UTC(),
  1777  			},
  1778  			Server:   s.Name(),
  1779  			ServerID: s.ID(),
  1780  			Cluster:  s.cachedClusterName(),
  1781  			Domain:   s.getOpts().JetStreamDomain,
  1782  		}
  1783  		s.publishAdvisory(nil, JSAdvisoryServerRemoved, adv)
  1784  
  1785  		go s.DisableJetStream()
  1786  	}
  1787  
  1788  	// Now check if we are meta-leader. We will attempt re-assignment.
  1789  	if !isLeader {
  1790  		return
  1791  	}
  1792  
  1793  	js.mu.Lock()
  1794  	defer js.mu.Unlock()
  1795  
  1796  	for _, asa := range cc.streams {
  1797  		for _, sa := range asa {
  1798  			if rg := sa.Group; rg.isMember(peer) {
  1799  				js.removePeerFromStreamLocked(sa, peer)
  1800  			}
  1801  		}
  1802  	}
  1803  }
  1804  
  1805  // Assumes all checks have already been done.
  1806  func (js *jetStream) removePeerFromStream(sa *streamAssignment, peer string) bool {
  1807  	js.mu.Lock()
  1808  	defer js.mu.Unlock()
  1809  	return js.removePeerFromStreamLocked(sa, peer)
  1810  }
  1811  
  1812  // Lock should be held.
  1813  func (js *jetStream) removePeerFromStreamLocked(sa *streamAssignment, peer string) bool {
  1814  	if rg := sa.Group; !rg.isMember(peer) {
  1815  		return false
  1816  	}
  1817  
  1818  	s, cc, csa := js.srv, js.cluster, sa.copyGroup()
  1819  	if cc == nil || cc.meta == nil {
  1820  		return false
  1821  	}
  1822  	replaced := cc.remapStreamAssignment(csa, peer)
  1823  	if !replaced {
  1824  		s.Warnf("JetStream cluster could not replace peer for stream '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name)
  1825  	}
  1826  
  1827  	// Send our proposal for this csa. Also use same group definition for all the consumers as well.
  1828  	cc.meta.Propose(encodeAddStreamAssignment(csa))
  1829  	rg := csa.Group
  1830  	for _, ca := range sa.consumers {
  1831  		// Ephemerals are R=1, so only auto-remap durables, or R>1.
  1832  		if ca.Config.Durable != _EMPTY_ {
  1833  			cca := ca.copyGroup()
  1834  			cca.Group.Peers, cca.Group.Preferred = rg.Peers, _EMPTY_
  1835  			cc.meta.Propose(encodeAddConsumerAssignment(cca))
  1836  		} else if ca.Group.isMember(peer) {
  1837  			// These are ephemerals. Check to see if we deleted this peer.
  1838  			cc.meta.Propose(encodeDeleteConsumerAssignment(ca))
  1839  		}
  1840  	}
  1841  	return replaced
  1842  }
  1843  
  1844  // Check if we have peer related entries.
  1845  func (js *jetStream) hasPeerEntries(entries []*Entry) bool {
  1846  	for _, e := range entries {
  1847  		if e.Type == EntryRemovePeer || e.Type == EntryAddPeer {
  1848  			return true
  1849  		}
  1850  	}
  1851  	return false
  1852  }
  1853  
  1854  const ksep = ":"
  1855  
  1856  func (sa *streamAssignment) recoveryKey() string {
  1857  	if sa == nil {
  1858  		return _EMPTY_
  1859  	}
  1860  	return sa.Client.serviceAccount() + ksep + sa.Config.Name
  1861  }
  1862  
  1863  func (ca *consumerAssignment) recoveryKey() string {
  1864  	if ca == nil {
  1865  		return _EMPTY_
  1866  	}
  1867  	return ca.Client.serviceAccount() + ksep + ca.Stream + ksep + ca.Name
  1868  }
  1869  
  1870  func (js *jetStream) applyMetaEntries(entries []*Entry, ru *recoveryUpdates) (bool, bool, bool, error) {
  1871  	var didSnap, didRemoveStream, didRemoveConsumer bool
  1872  	isRecovering := js.isMetaRecovering()
  1873  
  1874  	for _, e := range entries {
  1875  		if e.Type == EntrySnapshot {
  1876  			js.applyMetaSnapshot(e.Data, ru, isRecovering)
  1877  			didSnap = true
  1878  		} else if e.Type == EntryRemovePeer {
  1879  			if !isRecovering {
  1880  				js.processRemovePeer(string(e.Data))
  1881  			}
  1882  		} else if e.Type == EntryAddPeer {
  1883  			if !isRecovering {
  1884  				js.processAddPeer(string(e.Data))
  1885  			}
  1886  		} else {
  1887  			buf := e.Data
  1888  			switch entryOp(buf[0]) {
  1889  			case assignStreamOp:
  1890  				sa, err := decodeStreamAssignment(buf[1:])
  1891  				if err != nil {
  1892  					js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:])
  1893  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1894  				}
  1895  				if isRecovering {
  1896  					js.setStreamAssignmentRecovering(sa)
  1897  					delete(ru.removeStreams, sa.recoveryKey())
  1898  				}
  1899  				if js.processStreamAssignment(sa) {
  1900  					didRemoveStream = true
  1901  				}
  1902  			case removeStreamOp:
  1903  				sa, err := decodeStreamAssignment(buf[1:])
  1904  				if err != nil {
  1905  					js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:])
  1906  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1907  				}
  1908  				if isRecovering {
  1909  					js.setStreamAssignmentRecovering(sa)
  1910  					key := sa.recoveryKey()
  1911  					ru.removeStreams[key] = sa
  1912  					delete(ru.updateStreams, key)
  1913  				} else {
  1914  					js.processStreamRemoval(sa)
  1915  					didRemoveStream = true
  1916  				}
  1917  			case assignConsumerOp:
  1918  				ca, err := decodeConsumerAssignment(buf[1:])
  1919  				if err != nil {
  1920  					js.srv.Errorf("JetStream cluster failed to decode consumer assignment: %q", buf[1:])
  1921  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1922  				}
  1923  				if isRecovering {
  1924  					js.setConsumerAssignmentRecovering(ca)
  1925  					key := ca.recoveryKey()
  1926  					delete(ru.removeConsumers, key)
  1927  					ru.updateConsumers[key] = ca
  1928  				} else {
  1929  					js.processConsumerAssignment(ca)
  1930  				}
  1931  			case assignCompressedConsumerOp:
  1932  				ca, err := decodeConsumerAssignmentCompressed(buf[1:])
  1933  				if err != nil {
  1934  					js.srv.Errorf("JetStream cluster failed to decode compressed consumer assignment: %q", buf[1:])
  1935  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1936  				}
  1937  				if isRecovering {
  1938  					js.setConsumerAssignmentRecovering(ca)
  1939  					key := ca.recoveryKey()
  1940  					delete(ru.removeConsumers, key)
  1941  					ru.updateConsumers[key] = ca
  1942  				} else {
  1943  					js.processConsumerAssignment(ca)
  1944  				}
  1945  			case removeConsumerOp:
  1946  				ca, err := decodeConsumerAssignment(buf[1:])
  1947  				if err != nil {
  1948  					js.srv.Errorf("JetStream cluster failed to decode consumer assignment: %q", buf[1:])
  1949  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1950  				}
  1951  				if isRecovering {
  1952  					js.setConsumerAssignmentRecovering(ca)
  1953  					key := ca.recoveryKey()
  1954  					ru.removeConsumers[key] = ca
  1955  					delete(ru.updateConsumers, key)
  1956  				} else {
  1957  					js.processConsumerRemoval(ca)
  1958  					didRemoveConsumer = true
  1959  				}
  1960  			case updateStreamOp:
  1961  				sa, err := decodeStreamAssignment(buf[1:])
  1962  				if err != nil {
  1963  					js.srv.Errorf("JetStream cluster failed to decode stream assignment: %q", buf[1:])
  1964  					return didSnap, didRemoveStream, didRemoveConsumer, err
  1965  				}
  1966  				if isRecovering {
  1967  					js.setStreamAssignmentRecovering(sa)
  1968  					key := sa.recoveryKey()
  1969  					ru.updateStreams[key] = sa
  1970  					delete(ru.removeStreams, key)
  1971  				} else {
  1972  					js.processUpdateStreamAssignment(sa)
  1973  					// Since an update can be lowering replica count, we want upper layer to treat
  1974  					// similar to a removal and snapshot to collapse old entries.
  1975  					didRemoveStream = true
  1976  				}
  1977  			default:
  1978  				panic(fmt.Sprintf("JetStream Cluster Unknown meta entry op type: %v", entryOp(buf[0])))
  1979  			}
  1980  		}
  1981  	}
  1982  	return didSnap, didRemoveStream, didRemoveConsumer, nil
  1983  }
  1984  
  1985  func (rg *raftGroup) isMember(id string) bool {
  1986  	if rg == nil {
  1987  		return false
  1988  	}
  1989  	for _, peer := range rg.Peers {
  1990  		if peer == id {
  1991  			return true
  1992  		}
  1993  	}
  1994  	return false
  1995  }
  1996  
  1997  func (rg *raftGroup) setPreferred() {
  1998  	if rg == nil || len(rg.Peers) == 0 {
  1999  		return
  2000  	}
  2001  	if len(rg.Peers) == 1 {
  2002  		rg.Preferred = rg.Peers[0]
  2003  	} else {
  2004  		// For now just randomly select a peer for the preferred.
  2005  		pi := rand.Int31n(int32(len(rg.Peers)))
  2006  		rg.Preferred = rg.Peers[pi]
  2007  	}
  2008  }
  2009  
  2010  // createRaftGroup is called to spin up this raft group if needed.
  2011  func (js *jetStream) createRaftGroup(accName string, rg *raftGroup, storage StorageType, labels pprofLabels) error {
  2012  	js.mu.Lock()
  2013  	s, cc := js.srv, js.cluster
  2014  	if cc == nil || cc.meta == nil {
  2015  		js.mu.Unlock()
  2016  		return NewJSClusterNotActiveError()
  2017  	}
  2018  
  2019  	// If this is a single peer raft group or we are not a member return.
  2020  	if len(rg.Peers) <= 1 || !rg.isMember(cc.meta.ID()) {
  2021  		js.mu.Unlock()
  2022  		// Nothing to do here.
  2023  		return nil
  2024  	}
  2025  
  2026  	// Check if we already have this assigned.
  2027  	if node := s.lookupRaftNode(rg.Name); node != nil {
  2028  		s.Debugf("JetStream cluster already has raft group %q assigned", rg.Name)
  2029  		rg.node = node
  2030  		js.mu.Unlock()
  2031  		return nil
  2032  	}
  2033  
  2034  	s.Debugf("JetStream cluster creating raft group:%+v", rg)
  2035  	js.mu.Unlock()
  2036  
  2037  	sysAcc := s.SystemAccount()
  2038  	if sysAcc == nil {
  2039  		s.Debugf("JetStream cluster detected shutdown processing raft group: %+v", rg)
  2040  		return errors.New("shutting down")
  2041  	}
  2042  
  2043  	// Check here to see if we have a max HA Assets limit set.
  2044  	if maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets; maxHaAssets > 0 {
  2045  		if s.numRaftNodes() > maxHaAssets {
  2046  			s.Warnf("Maximum HA Assets limit reached: %d", maxHaAssets)
  2047  			// Since the meta leader assigned this, send a statsz update to them to get them up to date.
  2048  			go s.sendStatszUpdate()
  2049  			return errors.New("system limit reached")
  2050  		}
  2051  	}
  2052  
  2053  	storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, rg.Name)
  2054  	var store StreamStore
  2055  	if storage == FileStorage {
  2056  		fs, err := newFileStoreWithCreated(
  2057  			FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMediumBlockSize, AsyncFlush: false, SyncInterval: 5 * time.Minute, srv: s},
  2058  			StreamConfig{Name: rg.Name, Storage: FileStorage, Metadata: labels},
  2059  			time.Now().UTC(),
  2060  			s.jsKeyGen(s.getOpts().JetStreamKey, rg.Name),
  2061  			s.jsKeyGen(s.getOpts().JetStreamOldKey, rg.Name),
  2062  		)
  2063  		if err != nil {
  2064  			s.Errorf("Error creating filestore WAL: %v", err)
  2065  			return err
  2066  		}
  2067  		store = fs
  2068  	} else {
  2069  		ms, err := newMemStore(&StreamConfig{Name: rg.Name, Storage: MemoryStorage})
  2070  		if err != nil {
  2071  			s.Errorf("Error creating memstore WAL: %v", err)
  2072  			return err
  2073  		}
  2074  		store = ms
  2075  	}
  2076  
  2077  	cfg := &RaftConfig{Name: rg.Name, Store: storeDir, Log: store, Track: true}
  2078  
  2079  	if _, err := readPeerState(storeDir); err != nil {
  2080  		s.bootstrapRaftNode(cfg, rg.Peers, true)
  2081  	}
  2082  
  2083  	n, err := s.startRaftNode(accName, cfg, labels)
  2084  	if err != nil || n == nil {
  2085  		s.Debugf("Error creating raft group: %v", err)
  2086  		return err
  2087  	}
  2088  	// Need locking here for the assignment to avoid data-race reports
  2089  	js.mu.Lock()
  2090  	rg.node = n
  2091  	// See if we are preferred and should start campaign immediately.
  2092  	if n.ID() == rg.Preferred && n.Term() == 0 {
  2093  		n.Campaign()
  2094  	}
  2095  	js.mu.Unlock()
  2096  	return nil
  2097  }
  2098  
  2099  func (mset *stream) raftGroup() *raftGroup {
  2100  	if mset == nil {
  2101  		return nil
  2102  	}
  2103  	mset.mu.RLock()
  2104  	defer mset.mu.RUnlock()
  2105  	if mset.sa == nil {
  2106  		return nil
  2107  	}
  2108  	return mset.sa.Group
  2109  }
  2110  
  2111  func (mset *stream) raftNode() RaftNode {
  2112  	if mset == nil {
  2113  		return nil
  2114  	}
  2115  	mset.mu.RLock()
  2116  	defer mset.mu.RUnlock()
  2117  	return mset.node
  2118  }
  2119  
  2120  func (mset *stream) removeNode() {
  2121  	mset.mu.Lock()
  2122  	defer mset.mu.Unlock()
  2123  	if n := mset.node; n != nil {
  2124  		n.Delete()
  2125  		mset.node = nil
  2126  	}
  2127  }
  2128  
  2129  func (mset *stream) clearRaftNode() {
  2130  	if mset == nil {
  2131  		return
  2132  	}
  2133  	mset.mu.Lock()
  2134  	defer mset.mu.Unlock()
  2135  	mset.node = nil
  2136  }
  2137  
  2138  // Helper function to generate peer info.
  2139  // lists and sets for old and new.
  2140  func genPeerInfo(peers []string, split int) (newPeers, oldPeers []string, newPeerSet, oldPeerSet map[string]bool) {
  2141  	newPeers = peers[split:]
  2142  	oldPeers = peers[:split]
  2143  	newPeerSet = make(map[string]bool, len(newPeers))
  2144  	oldPeerSet = make(map[string]bool, len(oldPeers))
  2145  	for i, peer := range peers {
  2146  		if i < split {
  2147  			oldPeerSet[peer] = true
  2148  		} else {
  2149  			newPeerSet[peer] = true
  2150  		}
  2151  	}
  2152  	return
  2153  }
  2154  
  2155  // This will wait for a period of time until all consumers are registered and have
  2156  // their consumer assignments assigned.
  2157  // Should only be called from monitorStream.
  2158  func (mset *stream) waitOnConsumerAssignments() {
  2159  	mset.mu.RLock()
  2160  	s, js, acc, sa, name := mset.srv, mset.js, mset.acc, mset.sa, mset.cfg.Name
  2161  	mset.mu.RUnlock()
  2162  
  2163  	if s == nil || js == nil || acc == nil || sa == nil {
  2164  		return
  2165  	}
  2166  
  2167  	js.mu.RLock()
  2168  	numExpectedConsumers := len(sa.consumers)
  2169  	js.mu.RUnlock()
  2170  
  2171  	// Max to wait.
  2172  	const maxWaitTime = 10 * time.Second
  2173  	const sleepTime = 500 * time.Millisecond
  2174  
  2175  	// Wait up to 10s
  2176  	timeout := time.Now().Add(maxWaitTime)
  2177  	for time.Now().Before(timeout) {
  2178  		var numReady int
  2179  		for _, o := range mset.getConsumers() {
  2180  			// Make sure we are registered with our consumer assignment.
  2181  			if ca := o.consumerAssignment(); ca != nil {
  2182  				numReady++
  2183  			} else {
  2184  				break
  2185  			}
  2186  		}
  2187  		// Check if we are good.
  2188  		if numReady >= numExpectedConsumers {
  2189  			break
  2190  		}
  2191  
  2192  		s.Debugf("Waiting for consumers for interest based stream '%s > %s'", acc.Name, name)
  2193  		select {
  2194  		case <-s.quitCh:
  2195  			return
  2196  		case <-mset.monitorQuitC():
  2197  			return
  2198  		case <-time.After(sleepTime):
  2199  		}
  2200  	}
  2201  
  2202  	if actual := mset.numConsumers(); actual < numExpectedConsumers {
  2203  		s.Warnf("All consumers not online for '%s > %s': expected %d but only have %d", acc.Name, name, numExpectedConsumers, actual)
  2204  	}
  2205  }
  2206  
  2207  // Monitor our stream node for this stream.
  2208  func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnapshot bool) {
  2209  	s, cc := js.server(), js.cluster
  2210  	defer s.grWG.Done()
  2211  	if mset != nil {
  2212  		defer mset.monitorWg.Done()
  2213  	}
  2214  	js.mu.RLock()
  2215  	n := sa.Group.node
  2216  	meta := cc.meta
  2217  	js.mu.RUnlock()
  2218  
  2219  	if n == nil || meta == nil {
  2220  		s.Warnf("No RAFT group for '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name)
  2221  		return
  2222  	}
  2223  
  2224  	// Make sure only one is running.
  2225  	if mset != nil {
  2226  		if mset.checkInMonitor() {
  2227  			return
  2228  		}
  2229  		defer mset.clearMonitorRunning()
  2230  	}
  2231  
  2232  	// Make sure to stop the raft group on exit to prevent accidental memory bloat.
  2233  	// This should be below the checkInMonitor call though to avoid stopping it out
  2234  	// from underneath the one that is running since it will be the same raft node.
  2235  	defer n.Stop()
  2236  
  2237  	qch, mqch, lch, aq, uch, ourPeerId := n.QuitC(), mset.monitorQuitC(), n.LeadChangeC(), n.ApplyQ(), mset.updateC(), meta.ID()
  2238  
  2239  	s.Debugf("Starting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group())
  2240  	defer s.Debugf("Exiting stream monitor for '%s > %s' [%s]", sa.Client.serviceAccount(), sa.Config.Name, n.Group())
  2241  
  2242  	// Make sure we do not leave the apply channel to fill up and block the raft layer.
  2243  	defer func() {
  2244  		if n.State() == Closed {
  2245  			return
  2246  		}
  2247  		if n.Leader() {
  2248  			n.StepDown()
  2249  		}
  2250  		// Drain the commit queue...
  2251  		aq.drain()
  2252  	}()
  2253  
  2254  	const (
  2255  		compactInterval = 2 * time.Minute
  2256  		compactSizeMin  = 8 * 1024 * 1024
  2257  		compactNumMin   = 65536
  2258  		minSnapDelta    = 10 * time.Second
  2259  	)
  2260  
  2261  	// Spread these out for large numbers on server restart.
  2262  	rci := time.Duration(rand.Int63n(int64(time.Minute)))
  2263  	t := time.NewTicker(compactInterval + rci)
  2264  	defer t.Stop()
  2265  
  2266  	js.mu.RLock()
  2267  	isLeader := cc.isStreamLeader(sa.Client.serviceAccount(), sa.Config.Name)
  2268  	isRestore := sa.Restore != nil
  2269  	js.mu.RUnlock()
  2270  
  2271  	acc, err := s.LookupAccount(sa.Client.serviceAccount())
  2272  	if err != nil {
  2273  		s.Warnf("Could not retrieve account for stream '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name)
  2274  		return
  2275  	}
  2276  	accName := acc.GetName()
  2277  
  2278  	// Used to represent how we can detect a changed state quickly and without representing
  2279  	// a complete and detailed state which could be costly in terms of memory, cpu and GC.
  2280  	// This only entails how many messages, and the first and last sequence of the stream.
  2281  	// This is all that is needed to detect a change, and we can get this from FilteredState()
  2282  	// with and empty filter.
  2283  	var lastState SimpleState
  2284  	var lastSnapTime time.Time
  2285  
  2286  	// Don't allow the upper layer to install snapshots until we have
  2287  	// fully recovered from disk.
  2288  	isRecovering := true
  2289  
  2290  	// Should only to be called from leader.
  2291  	doSnapshot := func() {
  2292  		if mset == nil || isRecovering || isRestore || time.Since(lastSnapTime) < minSnapDelta {
  2293  			return
  2294  		}
  2295  
  2296  		// Before we actually calculate the detailed state and encode it, let's check the
  2297  		// simple state to detect any changes.
  2298  		curState := mset.store.FilteredState(0, _EMPTY_)
  2299  
  2300  		// If the state hasn't changed but the log has gone way over
  2301  		// the compaction size then we will want to compact anyway.
  2302  		// This shouldn't happen for streams like it can for pull
  2303  		// consumers on idle streams but better to be safe than sorry!
  2304  		ne, nb := n.Size()
  2305  		if curState == lastState && ne < compactNumMin && nb < compactSizeMin {
  2306  			return
  2307  		}
  2308  
  2309  		if err := n.InstallSnapshot(mset.stateSnapshot()); err == nil {
  2310  			lastState, lastSnapTime = curState, time.Now()
  2311  		} else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning {
  2312  			s.RateLimitWarnf("Failed to install snapshot for '%s > %s' [%s]: %v", mset.acc.Name, mset.name(), n.Group(), err)
  2313  		}
  2314  	}
  2315  
  2316  	// We will establish a restoreDoneCh no matter what. Will never be triggered unless
  2317  	// we replace with the restore chan.
  2318  	restoreDoneCh := make(<-chan error)
  2319  
  2320  	// For migration tracking.
  2321  	var mmt *time.Ticker
  2322  	var mmtc <-chan time.Time
  2323  
  2324  	startMigrationMonitoring := func() {
  2325  		if mmt == nil {
  2326  			mmt = time.NewTicker(500 * time.Millisecond)
  2327  			mmtc = mmt.C
  2328  		}
  2329  	}
  2330  
  2331  	stopMigrationMonitoring := func() {
  2332  		if mmt != nil {
  2333  			mmt.Stop()
  2334  			mmt, mmtc = nil, nil
  2335  		}
  2336  	}
  2337  	defer stopMigrationMonitoring()
  2338  
  2339  	// This is to optionally track when we are ready as a non-leader for direct access participation.
  2340  	// Either direct or if we are a direct mirror, or both.
  2341  	var dat *time.Ticker
  2342  	var datc <-chan time.Time
  2343  
  2344  	startDirectAccessMonitoring := func() {
  2345  		if dat == nil {
  2346  			dat = time.NewTicker(2 * time.Second)
  2347  			datc = dat.C
  2348  		}
  2349  	}
  2350  
  2351  	stopDirectMonitoring := func() {
  2352  		if dat != nil {
  2353  			dat.Stop()
  2354  			dat, datc = nil, nil
  2355  		}
  2356  	}
  2357  	defer stopDirectMonitoring()
  2358  
  2359  	if mset != nil && mset.isInterestRetention() {
  2360  		// Wait on our consumers to be assigned and running before proceeding.
  2361  		// This can become important when a server has lots of assets
  2362  		// since we process streams first then consumers as an asset class.
  2363  		mset.waitOnConsumerAssignments()
  2364  	}
  2365  
  2366  	// This is triggered during a scale up from R1 to clustered mode. We need the new followers to catchup,
  2367  	// similar to how we trigger the catchup mechanism post a backup/restore.
  2368  	// We can arrive here NOT being the leader, so we send the snapshot only if we are, and in this case
  2369  	// reset the notion that we need to send the snapshot. If we are not, then the first time the server
  2370  	// will switch to leader (in the loop below), we will send the snapshot.
  2371  	if sendSnapshot && isLeader && mset != nil && n != nil && !isRecovering {
  2372  		n.SendSnapshot(mset.stateSnapshot())
  2373  		sendSnapshot = false
  2374  	}
  2375  
  2376  	for {
  2377  		select {
  2378  		case <-s.quitCh:
  2379  			return
  2380  		case <-mqch:
  2381  			return
  2382  		case <-qch:
  2383  			return
  2384  		case <-aq.ch:
  2385  			var ne, nb uint64
  2386  			ces := aq.pop()
  2387  			for _, ce := range ces {
  2388  				// No special processing needed for when we are caught up on restart.
  2389  				if ce == nil {
  2390  					isRecovering = false
  2391  					// If we are interest based make sure to check consumers if interest retention policy.
  2392  					// This is to make sure we process any outstanding acks from all consumers.
  2393  					mset.checkInterestState()
  2394  					// Make sure we create a new snapshot in case things have changed such that any existing
  2395  					// snapshot may no longer be valid.
  2396  					doSnapshot()
  2397  					// If we became leader during this time and we need to send a snapshot to our
  2398  					// followers, i.e. as a result of a scale-up from R1, do it now.
  2399  					if sendSnapshot && isLeader && mset != nil && n != nil {
  2400  						n.SendSnapshot(mset.stateSnapshot())
  2401  						sendSnapshot = false
  2402  					}
  2403  					continue
  2404  				}
  2405  				// Apply our entries.
  2406  				if err := js.applyStreamEntries(mset, ce, isRecovering); err == nil {
  2407  					// Update our applied.
  2408  					ne, nb = n.Applied(ce.Index)
  2409  					ce.ReturnToPool()
  2410  				} else {
  2411  					// Our stream was closed out from underneath of us, simply return here.
  2412  					if err == errStreamClosed {
  2413  						return
  2414  					}
  2415  					s.Warnf("Error applying entries to '%s > %s': %v", accName, sa.Config.Name, err)
  2416  					if isClusterResetErr(err) {
  2417  						if mset.isMirror() && mset.IsLeader() {
  2418  							mset.retryMirrorConsumer()
  2419  							continue
  2420  						}
  2421  						// We will attempt to reset our cluster state.
  2422  						if mset.resetClusteredState(err) {
  2423  							aq.recycle(&ces)
  2424  							return
  2425  						}
  2426  					} else if isOutOfSpaceErr(err) {
  2427  						// If applicable this will tear all of this down, but don't assume so and return.
  2428  						s.handleOutOfSpace(mset)
  2429  					}
  2430  				}
  2431  			}
  2432  			aq.recycle(&ces)
  2433  
  2434  			// Check about snapshotting
  2435  			// If we have at least min entries to compact, go ahead and try to snapshot/compact.
  2436  			if ne >= compactNumMin || nb > compactSizeMin {
  2437  				doSnapshot()
  2438  			}
  2439  
  2440  		case isLeader = <-lch:
  2441  			if isLeader {
  2442  				if mset != nil && n != nil && sendSnapshot && !isRecovering {
  2443  					// If we *are* recovering at the time then this will get done when the apply queue
  2444  					// handles the nil guard to show the catchup ended.
  2445  					n.SendSnapshot(mset.stateSnapshot())
  2446  					sendSnapshot = false
  2447  				}
  2448  				if isRestore {
  2449  					acc, _ := s.LookupAccount(sa.Client.serviceAccount())
  2450  					restoreDoneCh = s.processStreamRestore(sa.Client, acc, sa.Config, _EMPTY_, sa.Reply, _EMPTY_)
  2451  					continue
  2452  				} else if n != nil && n.NeedSnapshot() {
  2453  					doSnapshot()
  2454  				}
  2455  				// Always cancel if this was running.
  2456  				stopDirectMonitoring()
  2457  
  2458  			} else if n.GroupLeader() != noLeader {
  2459  				js.setStreamAssignmentRecovering(sa)
  2460  			}
  2461  
  2462  			// Process our leader change.
  2463  			js.processStreamLeaderChange(mset, isLeader)
  2464  
  2465  			// We may receive a leader change after the stream assignment which would cancel us
  2466  			// monitoring for this closely. So re-assess our state here as well.
  2467  			// Or the old leader is no longer part of the set and transferred leadership
  2468  			// for this leader to resume with removal
  2469  			migrating := mset.isMigrating()
  2470  
  2471  			// Check for migrations here. We set the state on the stream assignment update below.
  2472  			if isLeader && migrating {
  2473  				startMigrationMonitoring()
  2474  			}
  2475  
  2476  			// Here we are checking if we are not the leader but we have been asked to allow
  2477  			// direct access. We now allow non-leaders to participate in the queue group.
  2478  			if !isLeader && mset != nil {
  2479  				mset.mu.RLock()
  2480  				ad, md := mset.cfg.AllowDirect, mset.cfg.MirrorDirect
  2481  				mset.mu.RUnlock()
  2482  				if ad || md {
  2483  					startDirectAccessMonitoring()
  2484  				}
  2485  			}
  2486  
  2487  		case <-datc:
  2488  			if mset == nil || isRecovering {
  2489  				continue
  2490  			}
  2491  			// If we are leader we can stop, we know this is setup now.
  2492  			if isLeader {
  2493  				stopDirectMonitoring()
  2494  				continue
  2495  			}
  2496  
  2497  			mset.mu.Lock()
  2498  			ad, md, current := mset.cfg.AllowDirect, mset.cfg.MirrorDirect, mset.isCurrent()
  2499  			if !current {
  2500  				const syncThreshold = 90.0
  2501  				// We are not current, but current means exactly caught up. Under heavy publish
  2502  				// loads we may never reach this, so check if we are within 90% caught up.
  2503  				_, c, a := mset.node.Progress()
  2504  				if c == 0 {
  2505  					mset.mu.Unlock()
  2506  					continue
  2507  				}
  2508  				if p := float64(a) / float64(c) * 100.0; p < syncThreshold {
  2509  					mset.mu.Unlock()
  2510  					continue
  2511  				} else {
  2512  					s.Debugf("Stream '%s > %s' enabling direct gets at %.0f%% synchronized",
  2513  						sa.Client.serviceAccount(), sa.Config.Name, p)
  2514  				}
  2515  			}
  2516  			// We are current, cancel monitoring and create the direct subs as needed.
  2517  			if ad {
  2518  				mset.subscribeToDirect()
  2519  			}
  2520  			if md {
  2521  				mset.subscribeToMirrorDirect()
  2522  			}
  2523  			mset.mu.Unlock()
  2524  			// Stop direct monitoring.
  2525  			stopDirectMonitoring()
  2526  
  2527  		case <-t.C:
  2528  			doSnapshot()
  2529  
  2530  		case <-uch:
  2531  			// keep stream assignment current
  2532  			sa = mset.streamAssignment()
  2533  
  2534  			// keep peer list up to date with config
  2535  			js.checkPeers(mset.raftGroup())
  2536  			// We get this when we have a new stream assignment caused by an update.
  2537  			// We want to know if we are migrating.
  2538  			if migrating := mset.isMigrating(); migrating {
  2539  				if isLeader && mmtc == nil {
  2540  					startMigrationMonitoring()
  2541  				}
  2542  			} else {
  2543  				stopMigrationMonitoring()
  2544  			}
  2545  		case <-mmtc:
  2546  			if !isLeader {
  2547  				// We are no longer leader, so not our job.
  2548  				stopMigrationMonitoring()
  2549  				continue
  2550  			}
  2551  
  2552  			// Check to see where we are..
  2553  			rg := mset.raftGroup()
  2554  
  2555  			// Track the new peers and check the ones that are current.
  2556  			mset.mu.RLock()
  2557  			replicas := mset.cfg.Replicas
  2558  			mset.mu.RUnlock()
  2559  			if len(rg.Peers) <= replicas {
  2560  				// Migration no longer happening, so not our job anymore
  2561  				stopMigrationMonitoring()
  2562  				continue
  2563  			}
  2564  
  2565  			// Make sure we have correct cluster information on the other peers.
  2566  			ci := js.clusterInfo(rg)
  2567  			mset.checkClusterInfo(ci)
  2568  
  2569  			newPeers, oldPeers, newPeerSet, oldPeerSet := genPeerInfo(rg.Peers, len(rg.Peers)-replicas)
  2570  
  2571  			// If we are part of the new peerset and we have been passed the baton.
  2572  			// We will handle scale down.
  2573  			if newPeerSet[ourPeerId] {
  2574  				// First need to check on any consumers and make sure they have moved properly before scaling down ourselves.
  2575  				js.mu.RLock()
  2576  				var needToWait bool
  2577  				for name, c := range sa.consumers {
  2578  					for _, peer := range c.Group.Peers {
  2579  						// If we have peers still in the old set block.
  2580  						if oldPeerSet[peer] {
  2581  							s.Debugf("Scale down of '%s > %s' blocked by consumer '%s'", accName, sa.Config.Name, name)
  2582  							needToWait = true
  2583  							break
  2584  						}
  2585  					}
  2586  					if needToWait {
  2587  						break
  2588  					}
  2589  				}
  2590  				js.mu.RUnlock()
  2591  				if needToWait {
  2592  					continue
  2593  				}
  2594  
  2595  				// We are good to go, can scale down here.
  2596  				for _, p := range oldPeers {
  2597  					n.ProposeRemovePeer(p)
  2598  				}
  2599  
  2600  				csa := sa.copyGroup()
  2601  				csa.Group.Peers = newPeers
  2602  				csa.Group.Preferred = ourPeerId
  2603  				csa.Group.Cluster = s.cachedClusterName()
  2604  				cc.meta.ForwardProposal(encodeUpdateStreamAssignment(csa))
  2605  				s.Noticef("Scaling down '%s > %s' to %+v", accName, sa.Config.Name, s.peerSetToNames(newPeers))
  2606  			} else {
  2607  				// We are the old leader here, from the original peer set.
  2608  				// We are simply waiting on the new peerset to be caught up so we can transfer leadership.
  2609  				var newLeaderPeer, newLeader string
  2610  				neededCurrent, current := replicas/2+1, 0
  2611  
  2612  				for _, r := range ci.Replicas {
  2613  					if r.Current && newPeerSet[r.Peer] {
  2614  						current++
  2615  						if newLeader == _EMPTY_ {
  2616  							newLeaderPeer, newLeader = r.Peer, r.Name
  2617  						}
  2618  					}
  2619  				}
  2620  				// Check if we have a quorom.
  2621  				if current >= neededCurrent {
  2622  					s.Noticef("Transfer of stream leader for '%s > %s' to '%s'", accName, sa.Config.Name, newLeader)
  2623  					n.UpdateKnownPeers(newPeers)
  2624  					n.StepDown(newLeaderPeer)
  2625  				}
  2626  			}
  2627  
  2628  		case err := <-restoreDoneCh:
  2629  			// We have completed a restore from snapshot on this server. The stream assignment has
  2630  			// already been assigned but the replicas will need to catch up out of band. Consumers
  2631  			// will need to be assigned by forwarding the proposal and stamping the initial state.
  2632  			s.Debugf("Stream restore for '%s > %s' completed", sa.Client.serviceAccount(), sa.Config.Name)
  2633  			if err != nil {
  2634  				s.Debugf("Stream restore failed: %v", err)
  2635  			}
  2636  			isRestore = false
  2637  			sa.Restore = nil
  2638  			// If we were successful lookup up our stream now.
  2639  			if err == nil {
  2640  				if mset, err = acc.lookupStream(sa.Config.Name); mset != nil {
  2641  					mset.monitorWg.Add(1)
  2642  					defer mset.monitorWg.Done()
  2643  					mset.setStreamAssignment(sa)
  2644  					// Make sure to update our updateC which would have been nil.
  2645  					uch = mset.updateC()
  2646  					// Also update our mqch
  2647  					mqch = mset.monitorQuitC()
  2648  				}
  2649  			}
  2650  			if err != nil {
  2651  				if mset != nil {
  2652  					mset.delete()
  2653  				}
  2654  				js.mu.Lock()
  2655  				sa.err = err
  2656  				if n != nil {
  2657  					n.Delete()
  2658  				}
  2659  				result := &streamAssignmentResult{
  2660  					Account: sa.Client.serviceAccount(),
  2661  					Stream:  sa.Config.Name,
  2662  					Restore: &JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}},
  2663  				}
  2664  				result.Restore.Error = NewJSStreamAssignmentError(err, Unless(err))
  2665  				js.mu.Unlock()
  2666  				// Send response to the metadata leader. They will forward to the user as needed.
  2667  				s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result)
  2668  				return
  2669  			}
  2670  
  2671  			if !isLeader {
  2672  				panic("Finished restore but not leader")
  2673  			}
  2674  			// Trigger the stream followers to catchup.
  2675  			if n = mset.raftNode(); n != nil {
  2676  				n.SendSnapshot(mset.stateSnapshot())
  2677  			}
  2678  			js.processStreamLeaderChange(mset, isLeader)
  2679  
  2680  			// Check to see if we have restored consumers here.
  2681  			// These are not currently assigned so we will need to do so here.
  2682  			if consumers := mset.getPublicConsumers(); len(consumers) > 0 {
  2683  				for _, o := range consumers {
  2684  					name, cfg := o.String(), o.config()
  2685  					rg := cc.createGroupForConsumer(&cfg, sa)
  2686  					// Pick a preferred leader.
  2687  					rg.setPreferred()
  2688  
  2689  					// Place our initial state here as well for assignment distribution.
  2690  					state, _ := o.store.State()
  2691  					ca := &consumerAssignment{
  2692  						Group:   rg,
  2693  						Stream:  sa.Config.Name,
  2694  						Name:    name,
  2695  						Config:  &cfg,
  2696  						Client:  sa.Client,
  2697  						Created: o.createdTime(),
  2698  						State:   state,
  2699  					}
  2700  
  2701  					// We make these compressed in case state is complex.
  2702  					addEntry := encodeAddConsumerAssignmentCompressed(ca)
  2703  					cc.meta.ForwardProposal(addEntry)
  2704  
  2705  					// Check to make sure we see the assignment.
  2706  					go func() {
  2707  						ticker := time.NewTicker(time.Second)
  2708  						defer ticker.Stop()
  2709  						for range ticker.C {
  2710  							js.mu.RLock()
  2711  							ca, meta := js.consumerAssignment(ca.Client.serviceAccount(), sa.Config.Name, name), cc.meta
  2712  							js.mu.RUnlock()
  2713  							if ca == nil {
  2714  								s.Warnf("Consumer assignment has not been assigned, retrying")
  2715  								if meta != nil {
  2716  									meta.ForwardProposal(addEntry)
  2717  								} else {
  2718  									return
  2719  								}
  2720  							} else {
  2721  								return
  2722  							}
  2723  						}
  2724  					}()
  2725  				}
  2726  			}
  2727  		}
  2728  	}
  2729  }
  2730  
  2731  // Determine if we are migrating
  2732  func (mset *stream) isMigrating() bool {
  2733  	if mset == nil {
  2734  		return false
  2735  	}
  2736  
  2737  	mset.mu.RLock()
  2738  	js, sa := mset.js, mset.sa
  2739  	mset.mu.RUnlock()
  2740  
  2741  	js.mu.RLock()
  2742  	defer js.mu.RUnlock()
  2743  
  2744  	// During migration we will always be R>1, even when we start R1.
  2745  	// So if we do not have a group or node we no we are not migrating.
  2746  	if sa == nil || sa.Group == nil || sa.Group.node == nil {
  2747  		return false
  2748  	}
  2749  	// The sign of migration is if our group peer count != configured replica count.
  2750  	if sa.Config.Replicas == len(sa.Group.Peers) {
  2751  		return false
  2752  	}
  2753  	return true
  2754  }
  2755  
  2756  // resetClusteredState is called when a clustered stream had an error (e.g sequence mismatch, bad snapshot) and needs to be reset.
  2757  func (mset *stream) resetClusteredState(err error) bool {
  2758  	mset.mu.RLock()
  2759  	s, js, jsa, sa, acc, node := mset.srv, mset.js, mset.jsa, mset.sa, mset.acc, mset.node
  2760  	stype, isLeader, tierName, replicas := mset.cfg.Storage, mset.isLeader(), mset.tier, mset.cfg.Replicas
  2761  	mset.mu.RUnlock()
  2762  
  2763  	// Stepdown regardless if we are the leader here.
  2764  	if isLeader && node != nil {
  2765  		node.StepDown()
  2766  	}
  2767  
  2768  	// If we detect we are shutting down just return.
  2769  	if js != nil && js.isShuttingDown() {
  2770  		s.Debugf("Will not reset stream, jetstream shutting down")
  2771  		return false
  2772  	}
  2773  
  2774  	// Server
  2775  	if js.limitsExceeded(stype) {
  2776  		s.Warnf("Will not reset stream, server resources exceeded")
  2777  		return false
  2778  	}
  2779  
  2780  	// Account
  2781  	if exceeded, _ := jsa.limitsExceeded(stype, tierName, replicas); exceeded {
  2782  		s.Warnf("stream '%s > %s' errored, account resources exceeded", acc, mset.name())
  2783  		return false
  2784  	}
  2785  
  2786  	// We delete our raft state. Will recreate.
  2787  	if node != nil {
  2788  		node.Delete()
  2789  	}
  2790  
  2791  	// Preserve our current state and messages unless we have a first sequence mismatch.
  2792  	shouldDelete := err == errFirstSequenceMismatch
  2793  
  2794  	// Need to do the rest in a separate Go routine.
  2795  	go func() {
  2796  		mset.monitorWg.Wait()
  2797  		mset.resetAndWaitOnConsumers()
  2798  		// Stop our stream.
  2799  		mset.stop(shouldDelete, false)
  2800  
  2801  		if sa != nil {
  2802  			js.mu.Lock()
  2803  			if js.shuttingDown {
  2804  				js.mu.Unlock()
  2805  				return
  2806  			}
  2807  
  2808  			s.Warnf("Resetting stream cluster state for '%s > %s'", sa.Client.serviceAccount(), sa.Config.Name)
  2809  			// Now wipe groups from assignments.
  2810  			sa.Group.node = nil
  2811  			var consumers []*consumerAssignment
  2812  			if cc := js.cluster; cc != nil && cc.meta != nil {
  2813  				ourID := cc.meta.ID()
  2814  				for _, ca := range sa.consumers {
  2815  					if rg := ca.Group; rg != nil && rg.isMember(ourID) {
  2816  						rg.node = nil // Erase group raft/node state.
  2817  						consumers = append(consumers, ca)
  2818  					}
  2819  				}
  2820  			}
  2821  			js.mu.Unlock()
  2822  
  2823  			// This will reset the stream and consumers.
  2824  			// Reset stream.
  2825  			js.processClusterCreateStream(acc, sa)
  2826  			// Reset consumers.
  2827  			for _, ca := range consumers {
  2828  				js.processClusterCreateConsumer(ca, nil, false)
  2829  			}
  2830  		}
  2831  	}()
  2832  
  2833  	return true
  2834  }
  2835  
  2836  func isControlHdr(hdr []byte) bool {
  2837  	return bytes.HasPrefix(hdr, []byte("NATS/1.0 100 "))
  2838  }
  2839  
  2840  // Apply our stream entries.
  2841  func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isRecovering bool) error {
  2842  	for _, e := range ce.Entries {
  2843  		if e.Type == EntryNormal {
  2844  			buf, op := e.Data, entryOp(e.Data[0])
  2845  			switch op {
  2846  			case streamMsgOp, compressedStreamMsgOp:
  2847  				if mset == nil {
  2848  					continue
  2849  				}
  2850  				s := js.srv
  2851  
  2852  				mbuf := buf[1:]
  2853  				if op == compressedStreamMsgOp {
  2854  					var err error
  2855  					mbuf, err = s2.Decode(nil, mbuf)
  2856  					if err != nil {
  2857  						panic(err.Error())
  2858  					}
  2859  				}
  2860  
  2861  				subject, reply, hdr, msg, lseq, ts, err := decodeStreamMsg(mbuf)
  2862  				if err != nil {
  2863  					if node := mset.raftNode(); node != nil {
  2864  						s.Errorf("JetStream cluster could not decode stream msg for '%s > %s' [%s]",
  2865  							mset.account(), mset.name(), node.Group())
  2866  					}
  2867  					panic(err.Error())
  2868  				}
  2869  
  2870  				// Check for flowcontrol here.
  2871  				if len(msg) == 0 && len(hdr) > 0 && reply != _EMPTY_ && isControlHdr(hdr) {
  2872  					if !isRecovering {
  2873  						mset.sendFlowControlReply(reply)
  2874  					}
  2875  					continue
  2876  				}
  2877  
  2878  				// Grab last sequence and CLFS.
  2879  				last, clfs := mset.lastSeqAndCLFS()
  2880  
  2881  				// We can skip if we know this is less than what we already have.
  2882  				if lseq-clfs < last {
  2883  					s.Debugf("Apply stream entries for '%s > %s' skipping message with sequence %d with last of %d",
  2884  						mset.account(), mset.name(), lseq+1-clfs, last)
  2885  					mset.mu.Lock()
  2886  					// Check for any preAcks in case we are interest based.
  2887  					mset.clearAllPreAcks(lseq + 1 - clfs)
  2888  					mset.mu.Unlock()
  2889  					continue
  2890  				}
  2891  
  2892  				// Skip by hand here since first msg special case.
  2893  				// Reason is sequence is unsigned and for lseq being 0
  2894  				// the lseq under stream would have to be -1.
  2895  				if lseq == 0 && last != 0 {
  2896  					continue
  2897  				}
  2898  
  2899  				// Messages to be skipped have no subject or timestamp or msg or hdr.
  2900  				if subject == _EMPTY_ && ts == 0 && len(msg) == 0 && len(hdr) == 0 {
  2901  					// Skip and update our lseq.
  2902  					last := mset.store.SkipMsg()
  2903  					mset.setLastSeq(last)
  2904  					mset.clearAllPreAcks(last)
  2905  					continue
  2906  				}
  2907  
  2908  				var mt *msgTrace
  2909  				// If not recovering, see if we find a message trace object for this
  2910  				// sequence. Only the leader that has proposed this entry will have
  2911  				// stored the trace info.
  2912  				if !isRecovering {
  2913  					mt = mset.getAndDeleteMsgTrace(lseq)
  2914  				}
  2915  				// Process the actual message here.
  2916  				err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt)
  2917  
  2918  				// If we have inflight make sure to clear after processing.
  2919  				// TODO(dlc) - technically check on inflight != nil could cause datarace.
  2920  				// But do not want to acquire lock since tracking this will be rare.
  2921  				if mset.inflight != nil {
  2922  					mset.clMu.Lock()
  2923  					delete(mset.inflight, lseq)
  2924  					mset.clMu.Unlock()
  2925  				}
  2926  
  2927  				if err != nil {
  2928  					if err == errLastSeqMismatch {
  2929  						var state StreamState
  2930  						mset.store.FastState(&state)
  2931  
  2932  						// If we have no msgs and the other side is delivering us a sequence past where we
  2933  						// should be reset. This is possible if the other side has a stale snapshot and no longer
  2934  						// has those messages. So compact and retry to reset.
  2935  						if state.Msgs == 0 {
  2936  							mset.store.Compact(lseq + 1)
  2937  							// Retry
  2938  							err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt)
  2939  						}
  2940  					}
  2941  
  2942  					// Only return in place if we are going to reset our stream or we are out of space, or we are closed.
  2943  					if isClusterResetErr(err) || isOutOfSpaceErr(err) || err == errStreamClosed {
  2944  						return err
  2945  					}
  2946  					s.Debugf("Apply stream entries for '%s > %s' got error processing message: %v",
  2947  						mset.account(), mset.name(), err)
  2948  				}
  2949  
  2950  			case deleteMsgOp:
  2951  				md, err := decodeMsgDelete(buf[1:])
  2952  				if err != nil {
  2953  					if node := mset.raftNode(); node != nil {
  2954  						s := js.srv
  2955  						s.Errorf("JetStream cluster could not decode delete msg for '%s > %s' [%s]",
  2956  							mset.account(), mset.name(), node.Group())
  2957  					}
  2958  					panic(err.Error())
  2959  				}
  2960  				s, cc := js.server(), js.cluster
  2961  
  2962  				var removed bool
  2963  				if md.NoErase {
  2964  					removed, err = mset.removeMsg(md.Seq)
  2965  				} else {
  2966  					removed, err = mset.eraseMsg(md.Seq)
  2967  				}
  2968  
  2969  				// Cluster reset error.
  2970  				if err == ErrStoreEOF {
  2971  					return err
  2972  				}
  2973  
  2974  				if err != nil && !isRecovering {
  2975  					s.Debugf("JetStream cluster failed to delete stream msg %d from '%s > %s': %v",
  2976  						md.Seq, md.Client.serviceAccount(), md.Stream, err)
  2977  				}
  2978  
  2979  				js.mu.RLock()
  2980  				isLeader := cc.isStreamLeader(md.Client.serviceAccount(), md.Stream)
  2981  				js.mu.RUnlock()
  2982  
  2983  				if isLeader && !isRecovering {
  2984  					var resp = JSApiMsgDeleteResponse{ApiResponse: ApiResponse{Type: JSApiMsgDeleteResponseType}}
  2985  					if err != nil {
  2986  						resp.Error = NewJSStreamMsgDeleteFailedError(err, Unless(err))
  2987  						s.sendAPIErrResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp))
  2988  					} else if !removed {
  2989  						resp.Error = NewJSSequenceNotFoundError(md.Seq)
  2990  						s.sendAPIErrResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp))
  2991  					} else {
  2992  						resp.Success = true
  2993  						s.sendAPIResponse(md.Client, mset.account(), md.Subject, md.Reply, _EMPTY_, s.jsonResponse(resp))
  2994  					}
  2995  				}
  2996  			case purgeStreamOp:
  2997  				sp, err := decodeStreamPurge(buf[1:])
  2998  				if err != nil {
  2999  					if node := mset.raftNode(); node != nil {
  3000  						s := js.srv
  3001  						s.Errorf("JetStream cluster could not decode purge msg for '%s > %s' [%s]",
  3002  							mset.account(), mset.name(), node.Group())
  3003  					}
  3004  					panic(err.Error())
  3005  				}
  3006  				// If no explicit request, fill in with leader stamped last sequence to protect ourselves on replay during server start.
  3007  				if sp.Request == nil || sp.Request.Sequence == 0 {
  3008  					purgeSeq := sp.LastSeq + 1
  3009  					if sp.Request == nil {
  3010  						sp.Request = &JSApiStreamPurgeRequest{Sequence: purgeSeq}
  3011  					} else if sp.Request.Keep == 0 {
  3012  						sp.Request.Sequence = purgeSeq
  3013  					} else if isRecovering {
  3014  						continue
  3015  					}
  3016  				}
  3017  
  3018  				s := js.server()
  3019  				purged, err := mset.purge(sp.Request)
  3020  				if err != nil {
  3021  					s.Warnf("JetStream cluster failed to purge stream %q for account %q: %v", sp.Stream, sp.Client.serviceAccount(), err)
  3022  				}
  3023  
  3024  				js.mu.RLock()
  3025  				isLeader := js.cluster.isStreamLeader(sp.Client.serviceAccount(), sp.Stream)
  3026  				js.mu.RUnlock()
  3027  
  3028  				if isLeader && !isRecovering {
  3029  					var resp = JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}}
  3030  					if err != nil {
  3031  						resp.Error = NewJSStreamGeneralError(err, Unless(err))
  3032  						s.sendAPIErrResponse(sp.Client, mset.account(), sp.Subject, sp.Reply, _EMPTY_, s.jsonResponse(resp))
  3033  					} else {
  3034  						resp.Purged = purged
  3035  						resp.Success = true
  3036  						s.sendAPIResponse(sp.Client, mset.account(), sp.Subject, sp.Reply, _EMPTY_, s.jsonResponse(resp))
  3037  					}
  3038  				}
  3039  			default:
  3040  				panic(fmt.Sprintf("JetStream Cluster Unknown group entry op type: %v", op))
  3041  			}
  3042  		} else if e.Type == EntrySnapshot {
  3043  			if mset == nil {
  3044  				return nil
  3045  			}
  3046  
  3047  			// Everything operates on new replicated state. Will convert legacy snapshots to this for processing.
  3048  			var ss *StreamReplicatedState
  3049  
  3050  			onBadState := func(err error) {
  3051  				// If we are the leader or recovering, meaning we own the snapshot,
  3052  				// we should stepdown and clear our raft state since our snapshot is bad.
  3053  				if isRecovering || mset.IsLeader() {
  3054  					mset.mu.RLock()
  3055  					s, accName, streamName := mset.srv, mset.acc.GetName(), mset.cfg.Name
  3056  					mset.mu.RUnlock()
  3057  					s.Warnf("Detected bad stream state, resetting '%s > %s'", accName, streamName)
  3058  					mset.resetClusteredState(err)
  3059  				}
  3060  			}
  3061  
  3062  			// Check if we are the new binary encoding.
  3063  			if IsEncodedStreamState(e.Data) {
  3064  				var err error
  3065  				ss, err = DecodeStreamState(e.Data)
  3066  				if err != nil {
  3067  					onBadState(err)
  3068  					return err
  3069  				}
  3070  			} else {
  3071  				var snap streamSnapshot
  3072  				if err := json.Unmarshal(e.Data, &snap); err != nil {
  3073  					onBadState(err)
  3074  					return err
  3075  				}
  3076  				// Convert over to StreamReplicatedState
  3077  				ss = &StreamReplicatedState{
  3078  					Msgs:     snap.Msgs,
  3079  					Bytes:    snap.Bytes,
  3080  					FirstSeq: snap.FirstSeq,
  3081  					LastSeq:  snap.LastSeq,
  3082  					Failed:   snap.Failed,
  3083  				}
  3084  				if len(snap.Deleted) > 0 {
  3085  					ss.Deleted = append(ss.Deleted, DeleteSlice(snap.Deleted))
  3086  				}
  3087  			}
  3088  
  3089  			if !isRecovering && !mset.IsLeader() {
  3090  				if err := mset.processSnapshot(ss); err != nil {
  3091  					return err
  3092  				}
  3093  			} else if isRecovering {
  3094  				// On recovery, reset CLFS/FAILED.
  3095  				mset.setCLFS(ss.Failed)
  3096  			}
  3097  		} else if e.Type == EntryRemovePeer {
  3098  			js.mu.RLock()
  3099  			var ourID string
  3100  			if js.cluster != nil && js.cluster.meta != nil {
  3101  				ourID = js.cluster.meta.ID()
  3102  			}
  3103  			js.mu.RUnlock()
  3104  			// We only need to do processing if this is us.
  3105  			if peer := string(e.Data); peer == ourID && mset != nil {
  3106  				// Double check here with the registered stream assignment.
  3107  				shouldRemove := true
  3108  				if sa := mset.streamAssignment(); sa != nil && sa.Group != nil {
  3109  					js.mu.RLock()
  3110  					shouldRemove = !sa.Group.isMember(ourID)
  3111  					js.mu.RUnlock()
  3112  				}
  3113  				if shouldRemove {
  3114  					mset.stop(true, false)
  3115  				}
  3116  			}
  3117  			return nil
  3118  		}
  3119  	}
  3120  	return nil
  3121  }
  3122  
  3123  // Returns the PeerInfo for all replicas of a raft node. This is different than node.Peers()
  3124  // and is used for external facing advisories.
  3125  func (s *Server) replicas(node RaftNode) []*PeerInfo {
  3126  	now := time.Now()
  3127  	var replicas []*PeerInfo
  3128  	for _, rp := range node.Peers() {
  3129  		if sir, ok := s.nodeToInfo.Load(rp.ID); ok && sir != nil {
  3130  			si := sir.(nodeInfo)
  3131  			pi := &PeerInfo{Peer: rp.ID, Name: si.name, Current: rp.Current, Active: now.Sub(rp.Last), Offline: si.offline, Lag: rp.Lag}
  3132  			replicas = append(replicas, pi)
  3133  		}
  3134  	}
  3135  	return replicas
  3136  }
  3137  
  3138  // Will check our node peers and see if we should remove a peer.
  3139  func (js *jetStream) checkPeers(rg *raftGroup) {
  3140  	js.mu.Lock()
  3141  	defer js.mu.Unlock()
  3142  
  3143  	// FIXME(dlc) - Single replicas?
  3144  	if rg == nil || rg.node == nil {
  3145  		return
  3146  	}
  3147  	for _, peer := range rg.node.Peers() {
  3148  		if !rg.isMember(peer.ID) {
  3149  			rg.node.ProposeRemovePeer(peer.ID)
  3150  		}
  3151  	}
  3152  }
  3153  
  3154  // Process a leader change for the clustered stream.
  3155  func (js *jetStream) processStreamLeaderChange(mset *stream, isLeader bool) {
  3156  	if mset == nil {
  3157  		return
  3158  	}
  3159  	sa := mset.streamAssignment()
  3160  	if sa == nil {
  3161  		return
  3162  	}
  3163  
  3164  	// Clear inflight if we have it.
  3165  	mset.clMu.Lock()
  3166  	mset.inflight = nil
  3167  	mset.clMu.Unlock()
  3168  
  3169  	js.mu.Lock()
  3170  	s, account, err := js.srv, sa.Client.serviceAccount(), sa.err
  3171  	client, subject, reply := sa.Client, sa.Subject, sa.Reply
  3172  	hasResponded := sa.responded
  3173  	sa.responded = true
  3174  	peers := copyStrings(sa.Group.Peers)
  3175  	js.mu.Unlock()
  3176  
  3177  	streamName := mset.name()
  3178  
  3179  	if isLeader {
  3180  		s.Noticef("JetStream cluster new stream leader for '%s > %s'", account, streamName)
  3181  		s.sendStreamLeaderElectAdvisory(mset)
  3182  		// Check for peer removal and process here if needed.
  3183  		js.checkPeers(sa.Group)
  3184  		mset.checkAllowMsgCompress(peers)
  3185  	} else {
  3186  		// We are stepping down.
  3187  		// Make sure if we are doing so because we have lost quorum that we send the appropriate advisories.
  3188  		if node := mset.raftNode(); node != nil && !node.Quorum() && time.Since(node.Created()) > 5*time.Second {
  3189  			s.sendStreamLostQuorumAdvisory(mset)
  3190  		}
  3191  
  3192  		// Clear clseq. If we become leader again, it will be fixed up
  3193  		// automatically on the next processClusteredInboundMsg call.
  3194  		mset.clMu.Lock()
  3195  		if mset.clseq > 0 {
  3196  			mset.clseq = 0
  3197  		}
  3198  		mset.clMu.Unlock()
  3199  	}
  3200  
  3201  	// Tell stream to switch leader status.
  3202  	mset.setLeader(isLeader)
  3203  
  3204  	if !isLeader || hasResponded {
  3205  		return
  3206  	}
  3207  
  3208  	acc, _ := s.LookupAccount(account)
  3209  	if acc == nil {
  3210  		return
  3211  	}
  3212  
  3213  	// Send our response.
  3214  	var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}
  3215  	if err != nil {
  3216  		resp.Error = NewJSStreamCreateError(err, Unless(err))
  3217  		s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  3218  	} else {
  3219  		resp.StreamInfo = &StreamInfo{
  3220  			Created:   mset.createdTime(),
  3221  			State:     mset.state(),
  3222  			Config:    mset.config(),
  3223  			Cluster:   js.clusterInfo(mset.raftGroup()),
  3224  			Sources:   mset.sourcesInfo(),
  3225  			Mirror:    mset.mirrorInfo(),
  3226  			TimeStamp: time.Now().UTC(),
  3227  		}
  3228  		resp.DidCreate = true
  3229  		s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  3230  		if node := mset.raftNode(); node != nil {
  3231  			mset.sendCreateAdvisory()
  3232  		}
  3233  	}
  3234  }
  3235  
  3236  // Fixed value ok for now.
  3237  const lostQuorumAdvInterval = 10 * time.Second
  3238  
  3239  // Determines if we should send lost quorum advisory. We throttle these after first one.
  3240  func (mset *stream) shouldSendLostQuorum() bool {
  3241  	mset.mu.Lock()
  3242  	defer mset.mu.Unlock()
  3243  	if time.Since(mset.lqsent) >= lostQuorumAdvInterval {
  3244  		mset.lqsent = time.Now()
  3245  		return true
  3246  	}
  3247  	return false
  3248  }
  3249  
  3250  func (s *Server) sendStreamLostQuorumAdvisory(mset *stream) {
  3251  	if mset == nil {
  3252  		return
  3253  	}
  3254  	node, stream, acc := mset.raftNode(), mset.name(), mset.account()
  3255  	if node == nil {
  3256  		return
  3257  	}
  3258  	if !mset.shouldSendLostQuorum() {
  3259  		return
  3260  	}
  3261  
  3262  	s.Warnf("JetStream cluster stream '%s > %s' has NO quorum, stalled", acc.GetName(), stream)
  3263  
  3264  	subj := JSAdvisoryStreamQuorumLostPre + "." + stream
  3265  	adv := &JSStreamQuorumLostAdvisory{
  3266  		TypedEvent: TypedEvent{
  3267  			Type: JSStreamQuorumLostAdvisoryType,
  3268  			ID:   nuid.Next(),
  3269  			Time: time.Now().UTC(),
  3270  		},
  3271  		Stream:   stream,
  3272  		Replicas: s.replicas(node),
  3273  		Domain:   s.getOpts().JetStreamDomain,
  3274  	}
  3275  
  3276  	// Send to the user's account if not the system account.
  3277  	if acc != s.SystemAccount() {
  3278  		s.publishAdvisory(acc, subj, adv)
  3279  	}
  3280  	// Now do system level one. Place account info in adv, and nil account means system.
  3281  	adv.Account = acc.GetName()
  3282  	s.publishAdvisory(nil, subj, adv)
  3283  }
  3284  
  3285  func (s *Server) sendStreamLeaderElectAdvisory(mset *stream) {
  3286  	if mset == nil {
  3287  		return
  3288  	}
  3289  	node, stream, acc := mset.raftNode(), mset.name(), mset.account()
  3290  	if node == nil {
  3291  		return
  3292  	}
  3293  	subj := JSAdvisoryStreamLeaderElectedPre + "." + stream
  3294  	adv := &JSStreamLeaderElectedAdvisory{
  3295  		TypedEvent: TypedEvent{
  3296  			Type: JSStreamLeaderElectedAdvisoryType,
  3297  			ID:   nuid.Next(),
  3298  			Time: time.Now().UTC(),
  3299  		},
  3300  		Stream:   stream,
  3301  		Leader:   s.serverNameForNode(node.GroupLeader()),
  3302  		Replicas: s.replicas(node),
  3303  		Domain:   s.getOpts().JetStreamDomain,
  3304  	}
  3305  
  3306  	// Send to the user's account if not the system account.
  3307  	if acc != s.SystemAccount() {
  3308  		s.publishAdvisory(acc, subj, adv)
  3309  	}
  3310  	// Now do system level one. Place account info in adv, and nil account means system.
  3311  	adv.Account = acc.GetName()
  3312  	s.publishAdvisory(nil, subj, adv)
  3313  }
  3314  
  3315  // Will lookup a stream assignment.
  3316  // Lock should be held.
  3317  func (js *jetStream) streamAssignment(account, stream string) (sa *streamAssignment) {
  3318  	cc := js.cluster
  3319  	if cc == nil {
  3320  		return nil
  3321  	}
  3322  
  3323  	if as := cc.streams[account]; as != nil {
  3324  		sa = as[stream]
  3325  	}
  3326  	return sa
  3327  }
  3328  
  3329  // processStreamAssignment is called when followers have replicated an assignment.
  3330  func (js *jetStream) processStreamAssignment(sa *streamAssignment) bool {
  3331  	js.mu.Lock()
  3332  	s, cc := js.srv, js.cluster
  3333  	accName, stream := sa.Client.serviceAccount(), sa.Config.Name
  3334  	noMeta := cc == nil || cc.meta == nil
  3335  	var ourID string
  3336  	if !noMeta {
  3337  		ourID = cc.meta.ID()
  3338  	}
  3339  	var isMember bool
  3340  	if sa.Group != nil && ourID != _EMPTY_ {
  3341  		isMember = sa.Group.isMember(ourID)
  3342  	}
  3343  
  3344  	// Remove this stream from the inflight proposals
  3345  	cc.removeInflightProposal(accName, sa.Config.Name)
  3346  
  3347  	if s == nil || noMeta {
  3348  		js.mu.Unlock()
  3349  		return false
  3350  	}
  3351  
  3352  	accStreams := cc.streams[accName]
  3353  	if accStreams == nil {
  3354  		accStreams = make(map[string]*streamAssignment)
  3355  	} else if osa := accStreams[stream]; osa != nil && osa != sa {
  3356  		// Copy over private existing state from former SA.
  3357  		if sa.Group != nil {
  3358  			sa.Group.node = osa.Group.node
  3359  		}
  3360  		sa.consumers = osa.consumers
  3361  		sa.responded = osa.responded
  3362  		sa.err = osa.err
  3363  	}
  3364  
  3365  	// Update our state.
  3366  	accStreams[stream] = sa
  3367  	cc.streams[accName] = accStreams
  3368  	hasResponded := sa.responded
  3369  	js.mu.Unlock()
  3370  
  3371  	acc, err := s.LookupAccount(accName)
  3372  	if err != nil {
  3373  		ll := fmt.Sprintf("Account [%s] lookup for stream create failed: %v", accName, err)
  3374  		if isMember {
  3375  			if !hasResponded {
  3376  				// If we can not lookup the account and we are a member, send this result back to the metacontroller leader.
  3377  				result := &streamAssignmentResult{
  3378  					Account:  accName,
  3379  					Stream:   stream,
  3380  					Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}},
  3381  				}
  3382  				result.Response.Error = NewJSNoAccountError()
  3383  				s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result)
  3384  			}
  3385  			s.Warnf(ll)
  3386  		} else {
  3387  			s.Debugf(ll)
  3388  		}
  3389  		return false
  3390  	}
  3391  
  3392  	var didRemove bool
  3393  
  3394  	// Check if this is for us..
  3395  	if isMember {
  3396  		js.processClusterCreateStream(acc, sa)
  3397  	} else if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil {
  3398  		// We have one here even though we are not a member. This can happen on re-assignment.
  3399  		s.removeStream(ourID, mset, sa)
  3400  	}
  3401  
  3402  	// If this stream assignment does not have a sync subject (bug) set that the meta-leader should check when elected.
  3403  	if sa.Sync == _EMPTY_ {
  3404  		js.mu.Lock()
  3405  		cc.streamsCheck = true
  3406  		js.mu.Unlock()
  3407  		return false
  3408  	}
  3409  
  3410  	return didRemove
  3411  }
  3412  
  3413  // processUpdateStreamAssignment is called when followers have replicated an updated assignment.
  3414  func (js *jetStream) processUpdateStreamAssignment(sa *streamAssignment) {
  3415  	js.mu.RLock()
  3416  	s, cc := js.srv, js.cluster
  3417  	js.mu.RUnlock()
  3418  	if s == nil || cc == nil {
  3419  		// TODO(dlc) - debug at least
  3420  		return
  3421  	}
  3422  
  3423  	accName := sa.Client.serviceAccount()
  3424  	stream := sa.Config.Name
  3425  
  3426  	js.mu.Lock()
  3427  	if cc.meta == nil {
  3428  		js.mu.Unlock()
  3429  		return
  3430  	}
  3431  	ourID := cc.meta.ID()
  3432  
  3433  	var isMember bool
  3434  	if sa.Group != nil {
  3435  		isMember = sa.Group.isMember(ourID)
  3436  	}
  3437  
  3438  	accStreams := cc.streams[accName]
  3439  	if accStreams == nil {
  3440  		js.mu.Unlock()
  3441  		return
  3442  	}
  3443  	osa := accStreams[stream]
  3444  	if osa == nil {
  3445  		js.mu.Unlock()
  3446  		return
  3447  	}
  3448  
  3449  	// Copy over private existing state from former SA.
  3450  	if sa.Group != nil {
  3451  		sa.Group.node = osa.Group.node
  3452  	}
  3453  	sa.consumers = osa.consumers
  3454  	sa.err = osa.err
  3455  
  3456  	// If we detect we are scaling down to 1, non-clustered, and we had a previous node, clear it here.
  3457  	if sa.Config.Replicas == 1 && sa.Group.node != nil {
  3458  		sa.Group.node = nil
  3459  	}
  3460  
  3461  	// Update our state.
  3462  	accStreams[stream] = sa
  3463  	cc.streams[accName] = accStreams
  3464  
  3465  	// Make sure we respond if we are a member.
  3466  	if isMember {
  3467  		sa.responded = false
  3468  	} else {
  3469  		// Make sure to clean up any old node in case this stream moves back here.
  3470  		if sa.Group != nil {
  3471  			sa.Group.node = nil
  3472  		}
  3473  	}
  3474  	js.mu.Unlock()
  3475  
  3476  	acc, err := s.LookupAccount(accName)
  3477  	if err != nil {
  3478  		s.Warnf("Update Stream Account %s, error on lookup: %v", accName, err)
  3479  		return
  3480  	}
  3481  
  3482  	// Check if this is for us..
  3483  	if isMember {
  3484  		js.processClusterUpdateStream(acc, osa, sa)
  3485  	} else if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil {
  3486  		// We have one here even though we are not a member. This can happen on re-assignment.
  3487  		s.removeStream(ourID, mset, sa)
  3488  	}
  3489  }
  3490  
  3491  // Common function to remove ourself from this server.
  3492  // This can happen on re-assignment, move, etc
  3493  func (s *Server) removeStream(ourID string, mset *stream, nsa *streamAssignment) {
  3494  	if mset == nil {
  3495  		return
  3496  	}
  3497  	// Make sure to use the new stream assignment, not our own.
  3498  	s.Debugf("JetStream removing stream '%s > %s' from this server", nsa.Client.serviceAccount(), nsa.Config.Name)
  3499  	if node := mset.raftNode(); node != nil {
  3500  		if node.Leader() {
  3501  			node.StepDown(nsa.Group.Preferred)
  3502  		}
  3503  		node.ProposeRemovePeer(ourID)
  3504  		// shutdown monitor by shutting down raft.
  3505  		node.Delete()
  3506  	}
  3507  
  3508  	var isShuttingDown bool
  3509  	// Make sure this node is no longer attached to our stream assignment.
  3510  	if js, _ := s.getJetStreamCluster(); js != nil {
  3511  		js.mu.Lock()
  3512  		nsa.Group.node = nil
  3513  		isShuttingDown = js.shuttingDown
  3514  		js.mu.Unlock()
  3515  	}
  3516  
  3517  	if !isShuttingDown {
  3518  		// wait for monitor to be shutdown.
  3519  		mset.monitorWg.Wait()
  3520  	}
  3521  	mset.stop(true, false)
  3522  }
  3523  
  3524  // processClusterUpdateStream is called when we have a stream assignment that
  3525  // has been updated for an existing assignment and we are a member.
  3526  func (js *jetStream) processClusterUpdateStream(acc *Account, osa, sa *streamAssignment) {
  3527  	if sa == nil {
  3528  		return
  3529  	}
  3530  
  3531  	js.mu.Lock()
  3532  	s, rg := js.srv, sa.Group
  3533  	client, subject, reply := sa.Client, sa.Subject, sa.Reply
  3534  	alreadyRunning, numReplicas := osa.Group.node != nil, len(rg.Peers)
  3535  	needsNode := rg.node == nil
  3536  	storage, cfg := sa.Config.Storage, sa.Config
  3537  	hasResponded := sa.responded
  3538  	sa.responded = true
  3539  	recovering := sa.recovering
  3540  	js.mu.Unlock()
  3541  
  3542  	mset, err := acc.lookupStream(cfg.Name)
  3543  	if err == nil && mset != nil {
  3544  		// Make sure we have not had a new group assigned to us.
  3545  		if osa.Group.Name != sa.Group.Name {
  3546  			s.Warnf("JetStream cluster detected stream remapping for '%s > %s' from %q to %q",
  3547  				acc, cfg.Name, osa.Group.Name, sa.Group.Name)
  3548  			mset.removeNode()
  3549  			alreadyRunning, needsNode = false, true
  3550  			// Make sure to clear from original.
  3551  			js.mu.Lock()
  3552  			osa.Group.node = nil
  3553  			js.mu.Unlock()
  3554  		}
  3555  
  3556  		var needsSetLeader bool
  3557  		if !alreadyRunning && numReplicas > 1 {
  3558  			if needsNode {
  3559  				mset.setLeader(false)
  3560  				js.createRaftGroup(acc.GetName(), rg, storage, pprofLabels{
  3561  					"type":    "stream",
  3562  					"account": mset.accName(),
  3563  					"stream":  mset.name(),
  3564  				})
  3565  			}
  3566  			mset.monitorWg.Add(1)
  3567  			// Start monitoring..
  3568  			s.startGoRoutine(
  3569  				func() { js.monitorStream(mset, sa, needsNode) },
  3570  				pprofLabels{
  3571  					"type":    "stream",
  3572  					"account": mset.accName(),
  3573  					"stream":  mset.name(),
  3574  				},
  3575  			)
  3576  		} else if numReplicas == 1 && alreadyRunning {
  3577  			// We downgraded to R1. Make sure we cleanup the raft node and the stream monitor.
  3578  			mset.removeNode()
  3579  			// Make sure we are leader now that we are R1.
  3580  			needsSetLeader = true
  3581  			// In case we need to shutdown the cluster specific subs, etc.
  3582  			mset.setLeader(false)
  3583  			js.mu.Lock()
  3584  			rg.node = nil
  3585  			js.mu.Unlock()
  3586  		}
  3587  		// Call update.
  3588  		if err = mset.updateWithAdvisory(cfg, !recovering); err != nil {
  3589  			s.Warnf("JetStream cluster error updating stream %q for account %q: %v", cfg.Name, acc.Name, err)
  3590  		}
  3591  		// Set the new stream assignment.
  3592  		mset.setStreamAssignment(sa)
  3593  		// Make sure we are the leader now that we are R1.
  3594  		if needsSetLeader {
  3595  			mset.setLeader(true)
  3596  		}
  3597  	}
  3598  
  3599  	// If not found we must be expanding into this node since if we are here we know we are a member.
  3600  	if err == ErrJetStreamStreamNotFound {
  3601  		js.processStreamAssignment(sa)
  3602  		return
  3603  	}
  3604  
  3605  	if err != nil {
  3606  		js.mu.Lock()
  3607  		sa.err = err
  3608  		result := &streamAssignmentResult{
  3609  			Account:  sa.Client.serviceAccount(),
  3610  			Stream:   sa.Config.Name,
  3611  			Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}},
  3612  			Update:   true,
  3613  		}
  3614  		result.Response.Error = NewJSStreamGeneralError(err, Unless(err))
  3615  		js.mu.Unlock()
  3616  
  3617  		// Send response to the metadata leader. They will forward to the user as needed.
  3618  		s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result)
  3619  		return
  3620  	}
  3621  
  3622  	isLeader := mset.IsLeader()
  3623  
  3624  	// Check for missing syncSubject bug.
  3625  	if isLeader && osa != nil && osa.Sync == _EMPTY_ {
  3626  		if node := mset.raftNode(); node != nil {
  3627  			node.StepDown()
  3628  		}
  3629  		return
  3630  	}
  3631  
  3632  	// If we were a single node being promoted assume leadership role for purpose of responding.
  3633  	if !hasResponded && !isLeader && !alreadyRunning {
  3634  		isLeader = true
  3635  	}
  3636  
  3637  	// Check if we should bail.
  3638  	if !isLeader || hasResponded || recovering {
  3639  		return
  3640  	}
  3641  
  3642  	// Send our response.
  3643  	var resp = JSApiStreamUpdateResponse{ApiResponse: ApiResponse{Type: JSApiStreamUpdateResponseType}}
  3644  	resp.StreamInfo = &StreamInfo{
  3645  		Created:   mset.createdTime(),
  3646  		State:     mset.state(),
  3647  		Config:    mset.config(),
  3648  		Cluster:   js.clusterInfo(mset.raftGroup()),
  3649  		Mirror:    mset.mirrorInfo(),
  3650  		Sources:   mset.sourcesInfo(),
  3651  		TimeStamp: time.Now().UTC(),
  3652  	}
  3653  
  3654  	s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  3655  }
  3656  
  3657  // processClusterCreateStream is called when we have a stream assignment that
  3658  // has been committed and this server is a member of the peer group.
  3659  func (js *jetStream) processClusterCreateStream(acc *Account, sa *streamAssignment) {
  3660  	if sa == nil {
  3661  		return
  3662  	}
  3663  
  3664  	js.mu.RLock()
  3665  	s, rg := js.srv, sa.Group
  3666  	alreadyRunning := rg.node != nil
  3667  	storage := sa.Config.Storage
  3668  	restore := sa.Restore
  3669  	js.mu.RUnlock()
  3670  
  3671  	// Process the raft group and make sure it's running if needed.
  3672  	err := js.createRaftGroup(acc.GetName(), rg, storage, pprofLabels{
  3673  		"type":    "stream",
  3674  		"account": acc.Name,
  3675  		"stream":  sa.Config.Name,
  3676  	})
  3677  
  3678  	// If we are restoring, create the stream if we are R>1 and not the preferred who handles the
  3679  	// receipt of the snapshot itself.
  3680  	shouldCreate := true
  3681  	if restore != nil {
  3682  		if len(rg.Peers) == 1 || rg.node != nil && rg.node.ID() == rg.Preferred {
  3683  			shouldCreate = false
  3684  		} else {
  3685  			js.mu.Lock()
  3686  			sa.Restore = nil
  3687  			js.mu.Unlock()
  3688  		}
  3689  	}
  3690  
  3691  	// Our stream.
  3692  	var mset *stream
  3693  
  3694  	// Process here if not restoring or not the leader.
  3695  	if shouldCreate && err == nil {
  3696  		// Go ahead and create or update the stream.
  3697  		mset, err = acc.lookupStream(sa.Config.Name)
  3698  		if err == nil && mset != nil {
  3699  			osa := mset.streamAssignment()
  3700  			// If we already have a stream assignment and they are the same exact config, short circuit here.
  3701  			if osa != nil {
  3702  				if reflect.DeepEqual(osa.Config, sa.Config) {
  3703  					if sa.Group.Name == osa.Group.Name && reflect.DeepEqual(sa.Group.Peers, osa.Group.Peers) {
  3704  						// Since this already exists we know it succeeded, just respond to this caller.
  3705  						js.mu.RLock()
  3706  						client, subject, reply, recovering := sa.Client, sa.Subject, sa.Reply, sa.recovering
  3707  						js.mu.RUnlock()
  3708  
  3709  						if !recovering {
  3710  							var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}
  3711  							resp.StreamInfo = &StreamInfo{
  3712  								Created:   mset.createdTime(),
  3713  								State:     mset.state(),
  3714  								Config:    mset.config(),
  3715  								Cluster:   js.clusterInfo(mset.raftGroup()),
  3716  								Sources:   mset.sourcesInfo(),
  3717  								Mirror:    mset.mirrorInfo(),
  3718  								TimeStamp: time.Now().UTC(),
  3719  							}
  3720  							s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  3721  						}
  3722  						return
  3723  					} else {
  3724  						// We had a bug where we could have multiple assignments for the same
  3725  						// stream but with different group assignments, including multiple raft
  3726  						// groups. So check for that here. We can only bet on the last one being
  3727  						// consistent in the long run, so let it continue if we see this condition.
  3728  						s.Warnf("JetStream cluster detected duplicate assignment for stream %q for account %q", sa.Config.Name, acc.Name)
  3729  						if osa.Group.node != nil && osa.Group.node != sa.Group.node {
  3730  							osa.Group.node.Delete()
  3731  							osa.Group.node = nil
  3732  						}
  3733  					}
  3734  				}
  3735  			}
  3736  			mset.setStreamAssignment(sa)
  3737  			// Check if our config has really been updated.
  3738  			if !reflect.DeepEqual(mset.config(), sa.Config) {
  3739  				if err = mset.updateWithAdvisory(sa.Config, false); err != nil {
  3740  					s.Warnf("JetStream cluster error updating stream %q for account %q: %v", sa.Config.Name, acc.Name, err)
  3741  					if osa != nil {
  3742  						// Process the raft group and make sure it's running if needed.
  3743  						js.createRaftGroup(acc.GetName(), osa.Group, storage, pprofLabels{
  3744  							"type":    "stream",
  3745  							"account": mset.accName(),
  3746  							"stream":  mset.name(),
  3747  						})
  3748  						mset.setStreamAssignment(osa)
  3749  					}
  3750  					if rg.node != nil {
  3751  						rg.node.Delete()
  3752  						rg.node = nil
  3753  					}
  3754  				}
  3755  			}
  3756  		} else if err == NewJSStreamNotFoundError() {
  3757  			// Add in the stream here.
  3758  			mset, err = acc.addStreamWithAssignment(sa.Config, nil, sa)
  3759  		}
  3760  		if mset != nil {
  3761  			mset.setCreatedTime(sa.Created)
  3762  		}
  3763  	}
  3764  
  3765  	// This is an error condition.
  3766  	if err != nil {
  3767  		if IsNatsErr(err, JSStreamStoreFailedF) {
  3768  			s.Warnf("Stream create failed for '%s > %s': %v", sa.Client.serviceAccount(), sa.Config.Name, err)
  3769  			err = errStreamStoreFailed
  3770  		}
  3771  		js.mu.Lock()
  3772  
  3773  		sa.err = err
  3774  		hasResponded := sa.responded
  3775  
  3776  		// If out of space do nothing for now.
  3777  		if isOutOfSpaceErr(err) {
  3778  			hasResponded = true
  3779  		}
  3780  
  3781  		if rg.node != nil {
  3782  			rg.node.Delete()
  3783  		}
  3784  
  3785  		var result *streamAssignmentResult
  3786  		if !hasResponded {
  3787  			result = &streamAssignmentResult{
  3788  				Account:  sa.Client.serviceAccount(),
  3789  				Stream:   sa.Config.Name,
  3790  				Response: &JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}},
  3791  			}
  3792  			result.Response.Error = NewJSStreamCreateError(err, Unless(err))
  3793  		}
  3794  		js.mu.Unlock()
  3795  
  3796  		// Send response to the metadata leader. They will forward to the user as needed.
  3797  		if result != nil {
  3798  			s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, result)
  3799  		}
  3800  		return
  3801  	}
  3802  
  3803  	// Re-capture node.
  3804  	js.mu.RLock()
  3805  	node := rg.node
  3806  	js.mu.RUnlock()
  3807  
  3808  	// Start our monitoring routine.
  3809  	if node != nil {
  3810  		if !alreadyRunning {
  3811  			if mset != nil {
  3812  				mset.monitorWg.Add(1)
  3813  			}
  3814  			s.startGoRoutine(
  3815  				func() { js.monitorStream(mset, sa, false) },
  3816  				pprofLabels{
  3817  					"type":    "stream",
  3818  					"account": mset.accName(),
  3819  					"stream":  mset.name(),
  3820  				},
  3821  			)
  3822  		}
  3823  	} else {
  3824  		// Single replica stream, process manually here.
  3825  		// If we are restoring, process that first.
  3826  		if sa.Restore != nil {
  3827  			// We are restoring a stream here.
  3828  			restoreDoneCh := s.processStreamRestore(sa.Client, acc, sa.Config, _EMPTY_, sa.Reply, _EMPTY_)
  3829  			s.startGoRoutine(func() {
  3830  				defer s.grWG.Done()
  3831  				select {
  3832  				case err := <-restoreDoneCh:
  3833  					if err == nil {
  3834  						mset, err = acc.lookupStream(sa.Config.Name)
  3835  						if mset != nil {
  3836  							mset.setStreamAssignment(sa)
  3837  							mset.setCreatedTime(sa.Created)
  3838  						}
  3839  					}
  3840  					if err != nil {
  3841  						if mset != nil {
  3842  							mset.delete()
  3843  						}
  3844  						js.mu.Lock()
  3845  						sa.err = err
  3846  						result := &streamAssignmentResult{
  3847  							Account: sa.Client.serviceAccount(),
  3848  							Stream:  sa.Config.Name,
  3849  							Restore: &JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}},
  3850  						}
  3851  						result.Restore.Error = NewJSStreamRestoreError(err, Unless(err))
  3852  						js.mu.Unlock()
  3853  						// Send response to the metadata leader. They will forward to the user as needed.
  3854  						b, _ := json.Marshal(result) // Avoids auto-processing and doing fancy json with newlines.
  3855  						s.sendInternalMsgLocked(streamAssignmentSubj, _EMPTY_, nil, b)
  3856  						return
  3857  					}
  3858  					js.processStreamLeaderChange(mset, true)
  3859  
  3860  					// Check to see if we have restored consumers here.
  3861  					// These are not currently assigned so we will need to do so here.
  3862  					if consumers := mset.getPublicConsumers(); len(consumers) > 0 {
  3863  						js.mu.RLock()
  3864  						cc := js.cluster
  3865  						js.mu.RUnlock()
  3866  
  3867  						for _, o := range consumers {
  3868  							name, cfg := o.String(), o.config()
  3869  							rg := cc.createGroupForConsumer(&cfg, sa)
  3870  
  3871  							// Place our initial state here as well for assignment distribution.
  3872  							ca := &consumerAssignment{
  3873  								Group:   rg,
  3874  								Stream:  sa.Config.Name,
  3875  								Name:    name,
  3876  								Config:  &cfg,
  3877  								Client:  sa.Client,
  3878  								Created: o.createdTime(),
  3879  							}
  3880  
  3881  							addEntry := encodeAddConsumerAssignment(ca)
  3882  							cc.meta.ForwardProposal(addEntry)
  3883  
  3884  							// Check to make sure we see the assignment.
  3885  							go func() {
  3886  								ticker := time.NewTicker(time.Second)
  3887  								defer ticker.Stop()
  3888  								for range ticker.C {
  3889  									js.mu.RLock()
  3890  									ca, meta := js.consumerAssignment(ca.Client.serviceAccount(), sa.Config.Name, name), cc.meta
  3891  									js.mu.RUnlock()
  3892  									if ca == nil {
  3893  										s.Warnf("Consumer assignment has not been assigned, retrying")
  3894  										if meta != nil {
  3895  											meta.ForwardProposal(addEntry)
  3896  										} else {
  3897  											return
  3898  										}
  3899  									} else {
  3900  										return
  3901  									}
  3902  								}
  3903  							}()
  3904  						}
  3905  					}
  3906  				case <-s.quitCh:
  3907  					return
  3908  				}
  3909  			})
  3910  		} else {
  3911  			js.processStreamLeaderChange(mset, true)
  3912  		}
  3913  	}
  3914  }
  3915  
  3916  // processStreamRemoval is called when followers have replicated an assignment.
  3917  func (js *jetStream) processStreamRemoval(sa *streamAssignment) {
  3918  	js.mu.Lock()
  3919  	s, cc := js.srv, js.cluster
  3920  	if s == nil || cc == nil || cc.meta == nil {
  3921  		// TODO(dlc) - debug at least
  3922  		js.mu.Unlock()
  3923  		return
  3924  	}
  3925  	stream := sa.Config.Name
  3926  	isMember := sa.Group.isMember(cc.meta.ID())
  3927  	wasLeader := cc.isStreamLeader(sa.Client.serviceAccount(), stream)
  3928  
  3929  	// Check if we already have this assigned.
  3930  	accStreams := cc.streams[sa.Client.serviceAccount()]
  3931  	needDelete := accStreams != nil && accStreams[stream] != nil
  3932  	if needDelete {
  3933  		delete(accStreams, stream)
  3934  		if len(accStreams) == 0 {
  3935  			delete(cc.streams, sa.Client.serviceAccount())
  3936  		}
  3937  	}
  3938  	js.mu.Unlock()
  3939  
  3940  	if needDelete {
  3941  		js.processClusterDeleteStream(sa, isMember, wasLeader)
  3942  	}
  3943  }
  3944  
  3945  func (js *jetStream) processClusterDeleteStream(sa *streamAssignment, isMember, wasLeader bool) {
  3946  	if sa == nil {
  3947  		return
  3948  	}
  3949  	js.mu.RLock()
  3950  	s := js.srv
  3951  	node := sa.Group.node
  3952  	hadLeader := node == nil || node.GroupLeader() != noLeader
  3953  	offline := s.allPeersOffline(sa.Group)
  3954  	var isMetaLeader bool
  3955  	if cc := js.cluster; cc != nil {
  3956  		isMetaLeader = cc.isLeader()
  3957  	}
  3958  	recovering := sa.recovering
  3959  	js.mu.RUnlock()
  3960  
  3961  	stopped := false
  3962  	var resp = JSApiStreamDeleteResponse{ApiResponse: ApiResponse{Type: JSApiStreamDeleteResponseType}}
  3963  	var err error
  3964  	var acc *Account
  3965  
  3966  	// Go ahead and delete the stream if we have it and the account here.
  3967  	if acc, _ = s.LookupAccount(sa.Client.serviceAccount()); acc != nil {
  3968  		if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil {
  3969  			// shut down monitor by shutting down raft
  3970  			if n := mset.raftNode(); n != nil {
  3971  				n.Delete()
  3972  			}
  3973  			// wait for monitor to be shut down
  3974  			mset.monitorWg.Wait()
  3975  			err = mset.stop(true, wasLeader)
  3976  			stopped = true
  3977  		} else if isMember {
  3978  			s.Warnf("JetStream failed to lookup running stream while removing stream '%s > %s' from this server",
  3979  				sa.Client.serviceAccount(), sa.Config.Name)
  3980  		}
  3981  	} else if isMember {
  3982  		s.Warnf("JetStream failed to lookup account while removing stream '%s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name)
  3983  	}
  3984  
  3985  	// Always delete the node if present.
  3986  	if node != nil {
  3987  		node.Delete()
  3988  	}
  3989  
  3990  	// This is a stop gap cleanup in case
  3991  	// 1) the account does not exist (and mset couldn't be stopped) and/or
  3992  	// 2) node was nil (and couldn't be deleted)
  3993  	if !stopped || node == nil {
  3994  		if sacc := s.SystemAccount(); sacc != nil {
  3995  			saccName := sacc.GetName()
  3996  			os.RemoveAll(filepath.Join(js.config.StoreDir, saccName, defaultStoreDirName, sa.Group.Name))
  3997  			// cleanup dependent consumer groups
  3998  			if !stopped {
  3999  				for _, ca := range sa.consumers {
  4000  					// Make sure we cleanup any possible running nodes for the consumers.
  4001  					if isMember && ca.Group != nil && ca.Group.node != nil {
  4002  						ca.Group.node.Delete()
  4003  					}
  4004  					os.RemoveAll(filepath.Join(js.config.StoreDir, saccName, defaultStoreDirName, ca.Group.Name))
  4005  				}
  4006  			}
  4007  		}
  4008  	}
  4009  	accDir := filepath.Join(js.config.StoreDir, sa.Client.serviceAccount())
  4010  	streamDir := filepath.Join(accDir, streamsDir)
  4011  	os.RemoveAll(filepath.Join(streamDir, sa.Config.Name))
  4012  
  4013  	// no op if not empty
  4014  	os.Remove(streamDir)
  4015  	os.Remove(accDir)
  4016  
  4017  	// Normally we want only the leader to respond here, but if we had no leader then all members will respond to make
  4018  	// sure we get feedback to the user.
  4019  	if !isMember || (hadLeader && !wasLeader) {
  4020  		// If all the peers are offline and we are the meta leader we will also respond, so suppress returning here.
  4021  		if !(offline && isMetaLeader) {
  4022  			return
  4023  		}
  4024  	}
  4025  
  4026  	// Do not respond if the account does not exist any longer
  4027  	if acc == nil || recovering {
  4028  		return
  4029  	}
  4030  
  4031  	if err != nil {
  4032  		resp.Error = NewJSStreamGeneralError(err, Unless(err))
  4033  		s.sendAPIErrResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, s.jsonResponse(resp))
  4034  	} else {
  4035  		resp.Success = true
  4036  		s.sendAPIResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, s.jsonResponse(resp))
  4037  	}
  4038  }
  4039  
  4040  // processConsumerAssignment is called when followers have replicated an assignment for a consumer.
  4041  func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) {
  4042  	js.mu.RLock()
  4043  	s, cc := js.srv, js.cluster
  4044  	accName, stream, consumerName := ca.Client.serviceAccount(), ca.Stream, ca.Name
  4045  	noMeta := cc == nil || cc.meta == nil
  4046  	shuttingDown := js.shuttingDown
  4047  	var ourID string
  4048  	if !noMeta {
  4049  		ourID = cc.meta.ID()
  4050  	}
  4051  	var isMember bool
  4052  	if ca.Group != nil && ourID != _EMPTY_ {
  4053  		isMember = ca.Group.isMember(ourID)
  4054  	}
  4055  	js.mu.RUnlock()
  4056  
  4057  	if s == nil || noMeta || shuttingDown {
  4058  		return
  4059  	}
  4060  
  4061  	sa := js.streamAssignment(accName, stream)
  4062  	if sa == nil {
  4063  		s.Debugf("Consumer create failed, could not locate stream '%s > %s'", accName, stream)
  4064  		return
  4065  	}
  4066  
  4067  	// Might need this below.
  4068  	numReplicas := sa.Config.Replicas
  4069  
  4070  	// Track if this existed already.
  4071  	var wasExisting bool
  4072  
  4073  	// Check if we have an existing consumer assignment.
  4074  	js.mu.Lock()
  4075  	if sa.consumers == nil {
  4076  		sa.consumers = make(map[string]*consumerAssignment)
  4077  	} else if oca := sa.consumers[ca.Name]; oca != nil {
  4078  		wasExisting = true
  4079  		// Copy over private existing state from former SA.
  4080  		if ca.Group != nil {
  4081  			ca.Group.node = oca.Group.node
  4082  		}
  4083  		ca.responded = oca.responded
  4084  		ca.err = oca.err
  4085  	}
  4086  
  4087  	// Capture the optional state. We will pass it along if we are a member to apply.
  4088  	// This is only applicable when restoring a stream with consumers.
  4089  	state := ca.State
  4090  	ca.State = nil
  4091  
  4092  	// Place into our internal map under the stream assignment.
  4093  	// Ok to replace an existing one, we check on process call below.
  4094  	sa.consumers[ca.Name] = ca
  4095  	js.mu.Unlock()
  4096  
  4097  	acc, err := s.LookupAccount(accName)
  4098  	if err != nil {
  4099  		ll := fmt.Sprintf("Account [%s] lookup for consumer create failed: %v", accName, err)
  4100  		if isMember {
  4101  			if !js.isMetaRecovering() {
  4102  				// If we can not lookup the account and we are a member, send this result back to the metacontroller leader.
  4103  				result := &consumerAssignmentResult{
  4104  					Account:  accName,
  4105  					Stream:   stream,
  4106  					Consumer: consumerName,
  4107  					Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}},
  4108  				}
  4109  				result.Response.Error = NewJSNoAccountError()
  4110  				s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result)
  4111  			}
  4112  			s.Warnf(ll)
  4113  		} else {
  4114  			s.Debugf(ll)
  4115  		}
  4116  		return
  4117  	}
  4118  
  4119  	// Check if this is for us..
  4120  	if isMember {
  4121  		js.processClusterCreateConsumer(ca, state, wasExisting)
  4122  	} else {
  4123  		// We need to be removed here, we are no longer assigned.
  4124  		// Grab consumer if we have it.
  4125  		var o *consumer
  4126  		if mset, _ := acc.lookupStream(sa.Config.Name); mset != nil {
  4127  			o = mset.lookupConsumer(ca.Name)
  4128  		}
  4129  
  4130  		// Check if we have a raft node running, meaning we are no longer part of the group but were.
  4131  		js.mu.Lock()
  4132  		if node := ca.Group.node; node != nil {
  4133  			// We have one here even though we are not a member. This can happen on re-assignment.
  4134  			s.Debugf("JetStream removing consumer '%s > %s > %s' from this server", sa.Client.serviceAccount(), sa.Config.Name, ca.Name)
  4135  			if node.Leader() {
  4136  				s.Debugf("JetStream consumer '%s > %s > %s' is being removed and was the leader, will perform stepdown",
  4137  					sa.Client.serviceAccount(), sa.Config.Name, ca.Name)
  4138  
  4139  				peers, cn := node.Peers(), s.cachedClusterName()
  4140  				migrating := numReplicas != len(peers)
  4141  
  4142  				// Select a new peer to transfer to. If we are a migrating make sure its from the new cluster.
  4143  				var npeer string
  4144  				for _, r := range peers {
  4145  					if !r.Current {
  4146  						continue
  4147  					}
  4148  					if !migrating {
  4149  						npeer = r.ID
  4150  						break
  4151  					} else if sir, ok := s.nodeToInfo.Load(r.ID); ok && sir != nil {
  4152  						si := sir.(nodeInfo)
  4153  						if si.cluster != cn {
  4154  							npeer = r.ID
  4155  							break
  4156  						}
  4157  					}
  4158  				}
  4159  				// Clear the raftnode from our consumer so that a subsequent o.delete will not also issue a stepdown.
  4160  				if o != nil {
  4161  					o.clearRaftNode()
  4162  				}
  4163  				// Manually handle the stepdown and deletion of the node.
  4164  				node.UpdateKnownPeers(ca.Group.Peers)
  4165  				node.StepDown(npeer)
  4166  				node.Delete()
  4167  			} else {
  4168  				node.UpdateKnownPeers(ca.Group.Peers)
  4169  			}
  4170  		}
  4171  		// Always clear the old node.
  4172  		ca.Group.node = nil
  4173  		ca.err = nil
  4174  		js.mu.Unlock()
  4175  
  4176  		if o != nil {
  4177  			o.deleteWithoutAdvisory()
  4178  		}
  4179  	}
  4180  }
  4181  
  4182  func (js *jetStream) processConsumerRemoval(ca *consumerAssignment) {
  4183  	js.mu.Lock()
  4184  	s, cc := js.srv, js.cluster
  4185  	if s == nil || cc == nil || cc.meta == nil {
  4186  		// TODO(dlc) - debug at least
  4187  		js.mu.Unlock()
  4188  		return
  4189  	}
  4190  	wasLeader := cc.isConsumerLeader(ca.Client.serviceAccount(), ca.Stream, ca.Name)
  4191  
  4192  	// Delete from our state.
  4193  	var needDelete bool
  4194  	if accStreams := cc.streams[ca.Client.serviceAccount()]; accStreams != nil {
  4195  		if sa := accStreams[ca.Stream]; sa != nil && sa.consumers != nil && sa.consumers[ca.Name] != nil {
  4196  			oca := sa.consumers[ca.Name]
  4197  			// Make sure this removal is for what we have, otherwise ignore.
  4198  			if ca.Group != nil && oca.Group != nil && ca.Group.Name == oca.Group.Name {
  4199  				needDelete = true
  4200  				oca.deleted = true
  4201  				delete(sa.consumers, ca.Name)
  4202  			}
  4203  		}
  4204  	}
  4205  	js.mu.Unlock()
  4206  
  4207  	if needDelete {
  4208  		js.processClusterDeleteConsumer(ca, wasLeader)
  4209  	}
  4210  }
  4211  
  4212  type consumerAssignmentResult struct {
  4213  	Account  string                       `json:"account"`
  4214  	Stream   string                       `json:"stream"`
  4215  	Consumer string                       `json:"consumer"`
  4216  	Response *JSApiConsumerCreateResponse `json:"response,omitempty"`
  4217  }
  4218  
  4219  // processClusterCreateConsumer is when we are a member of the group and need to create the consumer.
  4220  func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state *ConsumerState, wasExisting bool) {
  4221  	if ca == nil {
  4222  		return
  4223  	}
  4224  	js.mu.RLock()
  4225  	s := js.srv
  4226  	rg := ca.Group
  4227  	alreadyRunning := rg != nil && rg.node != nil
  4228  	accName, stream, consumer := ca.Client.serviceAccount(), ca.Stream, ca.Name
  4229  	js.mu.RUnlock()
  4230  
  4231  	acc, err := s.LookupAccount(accName)
  4232  	if err != nil {
  4233  		s.Warnf("JetStream cluster failed to lookup axccount %q: %v", accName, err)
  4234  		return
  4235  	}
  4236  
  4237  	// Go ahead and create or update the consumer.
  4238  	mset, err := acc.lookupStream(stream)
  4239  	if err != nil {
  4240  		if !js.isMetaRecovering() {
  4241  			js.mu.Lock()
  4242  			s.Warnf("Consumer create failed, could not locate stream '%s > %s > %s'", ca.Client.serviceAccount(), ca.Stream, ca.Name)
  4243  			ca.err = NewJSStreamNotFoundError()
  4244  			result := &consumerAssignmentResult{
  4245  				Account:  ca.Client.serviceAccount(),
  4246  				Stream:   ca.Stream,
  4247  				Consumer: ca.Name,
  4248  				Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}},
  4249  			}
  4250  			result.Response.Error = NewJSStreamNotFoundError()
  4251  			s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result)
  4252  			js.mu.Unlock()
  4253  		}
  4254  		return
  4255  	}
  4256  
  4257  	// Check if we already have this consumer running.
  4258  	o := mset.lookupConsumer(consumer)
  4259  
  4260  	if !alreadyRunning {
  4261  		// Process the raft group and make sure its running if needed.
  4262  		storage := mset.config().Storage
  4263  		if ca.Config.MemoryStorage {
  4264  			storage = MemoryStorage
  4265  		}
  4266  		// No-op if R1.
  4267  		js.createRaftGroup(accName, rg, storage, pprofLabels{
  4268  			"type":     "consumer",
  4269  			"account":  mset.accName(),
  4270  			"stream":   ca.Stream,
  4271  			"consumer": ca.Name,
  4272  		})
  4273  	} else {
  4274  		// If we are clustered update the known peers.
  4275  		js.mu.RLock()
  4276  		if node := rg.node; node != nil {
  4277  			node.UpdateKnownPeers(ca.Group.Peers)
  4278  		}
  4279  		js.mu.RUnlock()
  4280  	}
  4281  
  4282  	// Check if we already have this consumer running.
  4283  	var didCreate, isConfigUpdate, needsLocalResponse bool
  4284  	if o == nil {
  4285  		// Add in the consumer if needed.
  4286  		if o, err = mset.addConsumerWithAssignment(ca.Config, ca.Name, ca, js.isMetaRecovering(), ActionCreateOrUpdate); err == nil {
  4287  			didCreate = true
  4288  		}
  4289  	} else {
  4290  		// This consumer exists.
  4291  		// Only update if config is really different.
  4292  		cfg := o.config()
  4293  		if isConfigUpdate = !reflect.DeepEqual(&cfg, ca.Config); isConfigUpdate {
  4294  			// Call into update, ignore consumer exists error here since this means an old deliver subject is bound
  4295  			// which can happen on restart etc.
  4296  			if err := o.updateConfig(ca.Config); err != nil && err != NewJSConsumerNameExistError() {
  4297  				// This is essentially an update that has failed. Respond back to metaleader if we are not recovering.
  4298  				js.mu.RLock()
  4299  				if !js.metaRecovering {
  4300  					result := &consumerAssignmentResult{
  4301  						Account:  accName,
  4302  						Stream:   stream,
  4303  						Consumer: consumer,
  4304  						Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}},
  4305  					}
  4306  					result.Response.Error = NewJSConsumerNameExistError()
  4307  					s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, result)
  4308  				}
  4309  				s.Warnf("Consumer create failed during update for '%s > %s > %s': %v", ca.Client.serviceAccount(), ca.Stream, ca.Name, err)
  4310  				js.mu.RUnlock()
  4311  				return
  4312  			}
  4313  		}
  4314  
  4315  		var sendState bool
  4316  		js.mu.RLock()
  4317  		n := rg.node
  4318  		// Check if we already had a consumer assignment and its still pending.
  4319  		cca, oca := ca, o.consumerAssignment()
  4320  		if oca != nil {
  4321  			if !oca.responded {
  4322  				// We can't override info for replying here otherwise leader once elected can not respond.
  4323  				// So copy over original client and the reply from the old ca.
  4324  				cac := *ca
  4325  				cac.Client = oca.Client
  4326  				cac.Reply = oca.Reply
  4327  				cca = &cac
  4328  				needsLocalResponse = true
  4329  			}
  4330  			// If we look like we are scaling up, let's send our current state to the group.
  4331  			sendState = len(ca.Group.Peers) > len(oca.Group.Peers) && o.IsLeader() && n != nil
  4332  			// Signal that this is an update
  4333  			if ca.Reply != _EMPTY_ {
  4334  				isConfigUpdate = true
  4335  			}
  4336  		}
  4337  		js.mu.RUnlock()
  4338  
  4339  		if sendState {
  4340  			if snap, err := o.store.EncodedState(); err == nil {
  4341  				n.SendSnapshot(snap)
  4342  			}
  4343  		}
  4344  
  4345  		// Set CA for our consumer.
  4346  		o.setConsumerAssignment(cca)
  4347  		s.Debugf("JetStream cluster, consumer '%s > %s > %s' was already running", ca.Client.serviceAccount(), ca.Stream, ca.Name)
  4348  	}
  4349  
  4350  	// If we have an initial state set apply that now.
  4351  	if state != nil && o != nil {
  4352  		o.mu.Lock()
  4353  		err = o.setStoreState(state)
  4354  		o.mu.Unlock()
  4355  	}
  4356  
  4357  	if err != nil {
  4358  		if IsNatsErr(err, JSConsumerStoreFailedErrF) {
  4359  			s.Warnf("Consumer create failed for '%s > %s > %s': %v", ca.Client.serviceAccount(), ca.Stream, ca.Name, err)
  4360  			err = errConsumerStoreFailed
  4361  		}
  4362  
  4363  		js.mu.Lock()
  4364  
  4365  		ca.err = err
  4366  		hasResponded := ca.responded
  4367  
  4368  		// If out of space do nothing for now.
  4369  		if isOutOfSpaceErr(err) {
  4370  			hasResponded = true
  4371  		}
  4372  
  4373  		if rg.node != nil {
  4374  			rg.node.Delete()
  4375  			// Clear the node here.
  4376  			rg.node = nil
  4377  		}
  4378  
  4379  		// If we did seem to create a consumer make sure to stop it.
  4380  		if o != nil {
  4381  			o.stop()
  4382  		}
  4383  
  4384  		var result *consumerAssignmentResult
  4385  		if !hasResponded && !js.metaRecovering {
  4386  			result = &consumerAssignmentResult{
  4387  				Account:  ca.Client.serviceAccount(),
  4388  				Stream:   ca.Stream,
  4389  				Consumer: ca.Name,
  4390  				Response: &JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}},
  4391  			}
  4392  			result.Response.Error = NewJSConsumerCreateError(err, Unless(err))
  4393  		} else if err == errNoInterest {
  4394  			// This is a stranded ephemeral, let's clean this one up.
  4395  			subject := fmt.Sprintf(JSApiConsumerDeleteT, ca.Stream, ca.Name)
  4396  			mset.outq.send(newJSPubMsg(subject, _EMPTY_, _EMPTY_, nil, nil, nil, 0))
  4397  		}
  4398  		js.mu.Unlock()
  4399  
  4400  		if result != nil {
  4401  			// Send response to the metadata leader. They will forward to the user as needed.
  4402  			b, _ := json.Marshal(result) // Avoids auto-processing and doing fancy json with newlines.
  4403  			s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, b)
  4404  		}
  4405  	} else {
  4406  		if didCreate {
  4407  			o.setCreatedTime(ca.Created)
  4408  		} else {
  4409  			// Check for scale down to 1..
  4410  			if rg.node != nil && len(rg.Peers) == 1 {
  4411  				o.clearNode()
  4412  				o.setLeader(true)
  4413  				// Need to clear from rg too.
  4414  				js.mu.Lock()
  4415  				rg.node = nil
  4416  				client, subject, reply := ca.Client, ca.Subject, ca.Reply
  4417  				js.mu.Unlock()
  4418  				var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}
  4419  				resp.ConsumerInfo = o.info()
  4420  				s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  4421  				return
  4422  			}
  4423  		}
  4424  
  4425  		if rg.node == nil {
  4426  			// Single replica consumer, process manually here.
  4427  			js.mu.Lock()
  4428  			// Force response in case we think this is an update.
  4429  			if !js.metaRecovering && isConfigUpdate {
  4430  				ca.responded = false
  4431  			}
  4432  			js.mu.Unlock()
  4433  			js.processConsumerLeaderChange(o, true)
  4434  		} else {
  4435  			// Clustered consumer.
  4436  			// Start our monitoring routine if needed.
  4437  			if !alreadyRunning && o.shouldStartMonitor() {
  4438  				s.startGoRoutine(
  4439  					func() { js.monitorConsumer(o, ca) },
  4440  					pprofLabels{
  4441  						"type":     "consumer",
  4442  						"account":  mset.accName(),
  4443  						"stream":   mset.name(),
  4444  						"consumer": ca.Name,
  4445  					},
  4446  				)
  4447  			}
  4448  			// For existing consumer, only send response if not recovering.
  4449  			if wasExisting && !js.isMetaRecovering() {
  4450  				if o.IsLeader() || (!didCreate && needsLocalResponse) {
  4451  					// Process if existing as an update. Double check that this is not recovered.
  4452  					js.mu.RLock()
  4453  					client, subject, reply, recovering := ca.Client, ca.Subject, ca.Reply, ca.recovering
  4454  					js.mu.RUnlock()
  4455  					if !recovering {
  4456  						var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}
  4457  						resp.ConsumerInfo = o.info()
  4458  						s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  4459  					}
  4460  				}
  4461  			}
  4462  		}
  4463  	}
  4464  }
  4465  
  4466  func (js *jetStream) processClusterDeleteConsumer(ca *consumerAssignment, wasLeader bool) {
  4467  	if ca == nil {
  4468  		return
  4469  	}
  4470  	js.mu.RLock()
  4471  	s := js.srv
  4472  	node := ca.Group.node
  4473  	offline := s.allPeersOffline(ca.Group)
  4474  	var isMetaLeader bool
  4475  	if cc := js.cluster; cc != nil {
  4476  		isMetaLeader = cc.isLeader()
  4477  	}
  4478  	recovering := ca.recovering
  4479  	js.mu.RUnlock()
  4480  
  4481  	var resp = JSApiConsumerDeleteResponse{ApiResponse: ApiResponse{Type: JSApiConsumerDeleteResponseType}}
  4482  	var err error
  4483  	var acc *Account
  4484  
  4485  	// Go ahead and delete the consumer if we have it and the account.
  4486  	if acc, _ = s.LookupAccount(ca.Client.serviceAccount()); acc != nil {
  4487  		if mset, _ := acc.lookupStream(ca.Stream); mset != nil {
  4488  			if o := mset.lookupConsumer(ca.Name); o != nil {
  4489  				err = o.stopWithFlags(true, false, true, wasLeader)
  4490  			}
  4491  		}
  4492  	} else if ca.Group != nil {
  4493  		// We have a missing account, see if we can cleanup.
  4494  		if sacc := s.SystemAccount(); sacc != nil {
  4495  			os.RemoveAll(filepath.Join(js.config.StoreDir, sacc.GetName(), defaultStoreDirName, ca.Group.Name))
  4496  		}
  4497  	}
  4498  
  4499  	// Always delete the node if present.
  4500  	if node != nil {
  4501  		node.Delete()
  4502  	}
  4503  
  4504  	if !wasLeader || ca.Reply == _EMPTY_ {
  4505  		if !(offline && isMetaLeader) {
  4506  			return
  4507  		}
  4508  	}
  4509  
  4510  	// Do not respond if the account does not exist any longer or this is during recovery.
  4511  	if acc == nil || recovering {
  4512  		return
  4513  	}
  4514  
  4515  	if err != nil {
  4516  		resp.Error = NewJSStreamNotFoundError(Unless(err))
  4517  		s.sendAPIErrResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(resp))
  4518  	} else {
  4519  		resp.Success = true
  4520  		s.sendAPIResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(resp))
  4521  	}
  4522  }
  4523  
  4524  // Returns the consumer assignment, or nil if not present.
  4525  // Lock should be held.
  4526  func (js *jetStream) consumerAssignment(account, stream, consumer string) *consumerAssignment {
  4527  	if sa := js.streamAssignment(account, stream); sa != nil {
  4528  		return sa.consumers[consumer]
  4529  	}
  4530  	return nil
  4531  }
  4532  
  4533  // consumerAssigned informs us if this server has this consumer assigned.
  4534  func (jsa *jsAccount) consumerAssigned(stream, consumer string) bool {
  4535  	jsa.mu.RLock()
  4536  	js, acc := jsa.js, jsa.account
  4537  	jsa.mu.RUnlock()
  4538  
  4539  	if js == nil {
  4540  		return false
  4541  	}
  4542  	js.mu.RLock()
  4543  	defer js.mu.RUnlock()
  4544  	return js.cluster.isConsumerAssigned(acc, stream, consumer)
  4545  }
  4546  
  4547  // Read lock should be held.
  4548  func (cc *jetStreamCluster) isConsumerAssigned(a *Account, stream, consumer string) bool {
  4549  	// Non-clustered mode always return true.
  4550  	if cc == nil {
  4551  		return true
  4552  	}
  4553  	if cc.meta == nil {
  4554  		return false
  4555  	}
  4556  	var sa *streamAssignment
  4557  	accStreams := cc.streams[a.Name]
  4558  	if accStreams != nil {
  4559  		sa = accStreams[stream]
  4560  	}
  4561  	if sa == nil {
  4562  		// TODO(dlc) - This should not happen.
  4563  		return false
  4564  	}
  4565  	ca := sa.consumers[consumer]
  4566  	if ca == nil {
  4567  		return false
  4568  	}
  4569  	rg := ca.Group
  4570  	// Check if we are the leader of this raftGroup assigned to the stream.
  4571  	ourID := cc.meta.ID()
  4572  	for _, peer := range rg.Peers {
  4573  		if peer == ourID {
  4574  			return true
  4575  		}
  4576  	}
  4577  	return false
  4578  }
  4579  
  4580  // Returns our stream and underlying raft node.
  4581  func (o *consumer) streamAndNode() (*stream, RaftNode) {
  4582  	if o == nil {
  4583  		return nil, nil
  4584  	}
  4585  	o.mu.RLock()
  4586  	defer o.mu.RUnlock()
  4587  	return o.mset, o.node
  4588  }
  4589  
  4590  // Return the replica count for this consumer. If the consumer has been
  4591  // stopped, this will return an error.
  4592  func (o *consumer) replica() (int, error) {
  4593  	o.mu.RLock()
  4594  	oCfg := o.cfg
  4595  	mset := o.mset
  4596  	o.mu.RUnlock()
  4597  	if mset == nil {
  4598  		return 0, errBadConsumer
  4599  	}
  4600  	sCfg := mset.config()
  4601  	return oCfg.replicas(&sCfg), nil
  4602  }
  4603  
  4604  func (o *consumer) raftGroup() *raftGroup {
  4605  	if o == nil {
  4606  		return nil
  4607  	}
  4608  	o.mu.RLock()
  4609  	defer o.mu.RUnlock()
  4610  	if o.ca == nil {
  4611  		return nil
  4612  	}
  4613  	return o.ca.Group
  4614  }
  4615  
  4616  func (o *consumer) clearRaftNode() {
  4617  	if o == nil {
  4618  		return
  4619  	}
  4620  	o.mu.Lock()
  4621  	defer o.mu.Unlock()
  4622  	o.node = nil
  4623  }
  4624  
  4625  func (o *consumer) raftNode() RaftNode {
  4626  	if o == nil {
  4627  		return nil
  4628  	}
  4629  	o.mu.RLock()
  4630  	defer o.mu.RUnlock()
  4631  	return o.node
  4632  }
  4633  
  4634  func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) {
  4635  	s, n, cc := js.server(), o.raftNode(), js.cluster
  4636  	defer s.grWG.Done()
  4637  
  4638  	defer o.clearMonitorRunning()
  4639  
  4640  	if n == nil {
  4641  		s.Warnf("No RAFT group for '%s > %s > %s'", o.acc.Name, ca.Stream, ca.Name)
  4642  		return
  4643  	}
  4644  
  4645  	// Make sure to stop the raft group on exit to prevent accidental memory bloat.
  4646  	// This should be below the checkInMonitor call though to avoid stopping it out
  4647  	// from underneath the one that is running since it will be the same raft node.
  4648  	defer n.Stop()
  4649  
  4650  	qch, lch, aq, uch, ourPeerId := n.QuitC(), n.LeadChangeC(), n.ApplyQ(), o.updateC(), cc.meta.ID()
  4651  
  4652  	s.Debugf("Starting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group())
  4653  	defer s.Debugf("Exiting consumer monitor for '%s > %s > %s' [%s]", o.acc.Name, ca.Stream, ca.Name, n.Group())
  4654  
  4655  	const (
  4656  		compactInterval = 2 * time.Minute
  4657  		compactSizeMin  = 64 * 1024 // What is stored here is always small for consumers.
  4658  		compactNumMin   = 1024
  4659  		minSnapDelta    = 10 * time.Second
  4660  	)
  4661  
  4662  	// Spread these out for large numbers on server restart.
  4663  	rci := time.Duration(rand.Int63n(int64(time.Minute)))
  4664  	t := time.NewTicker(compactInterval + rci)
  4665  	defer t.Stop()
  4666  
  4667  	// Highwayhash key for generating hashes.
  4668  	key := make([]byte, 32)
  4669  	crand.Read(key)
  4670  
  4671  	// Hash of the last snapshot (fixed size in memory).
  4672  	var lastSnap []byte
  4673  	var lastSnapTime time.Time
  4674  
  4675  	// Don't allow the upper layer to install snapshots until we have
  4676  	// fully recovered from disk.
  4677  	recovering := true
  4678  
  4679  	doSnapshot := func(force bool) {
  4680  		// Bail if trying too fast and not in a forced situation.
  4681  		if recovering || (!force && time.Since(lastSnapTime) < minSnapDelta) {
  4682  			return
  4683  		}
  4684  
  4685  		// Check several things to see if we need a snapshot.
  4686  		ne, nb := n.Size()
  4687  		if !n.NeedSnapshot() {
  4688  			// Check if we should compact etc. based on size of log.
  4689  			if !force && ne < compactNumMin && nb < compactSizeMin {
  4690  				return
  4691  			}
  4692  		}
  4693  
  4694  		if snap, err := o.store.EncodedState(); err == nil {
  4695  			hash := highwayhash.Sum(snap, key)
  4696  			// If the state hasn't changed but the log has gone way over
  4697  			// the compaction size then we will want to compact anyway.
  4698  			// This can happen for example when a pull consumer fetches a
  4699  			// lot on an idle stream, log entries get distributed but the
  4700  			// state never changes, therefore the log never gets compacted.
  4701  			if !bytes.Equal(hash[:], lastSnap) || ne >= compactNumMin || nb >= compactSizeMin {
  4702  				if err := n.InstallSnapshot(snap); err == nil {
  4703  					lastSnap, lastSnapTime = hash[:], time.Now()
  4704  				} else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning {
  4705  					s.RateLimitWarnf("Failed to install snapshot for '%s > %s > %s' [%s]: %v", o.acc.Name, ca.Stream, ca.Name, n.Group(), err)
  4706  				}
  4707  			}
  4708  		}
  4709  	}
  4710  
  4711  	// For migration tracking.
  4712  	var mmt *time.Ticker
  4713  	var mmtc <-chan time.Time
  4714  
  4715  	startMigrationMonitoring := func() {
  4716  		if mmt == nil {
  4717  			mmt = time.NewTicker(500 * time.Millisecond)
  4718  			mmtc = mmt.C
  4719  		}
  4720  	}
  4721  
  4722  	stopMigrationMonitoring := func() {
  4723  		if mmt != nil {
  4724  			mmt.Stop()
  4725  			mmt, mmtc = nil, nil
  4726  		}
  4727  	}
  4728  	defer stopMigrationMonitoring()
  4729  
  4730  	// Track if we are leader.
  4731  	var isLeader bool
  4732  
  4733  	for {
  4734  		select {
  4735  		case <-s.quitCh:
  4736  			return
  4737  		case <-qch:
  4738  			return
  4739  		case <-aq.ch:
  4740  			ces := aq.pop()
  4741  			for _, ce := range ces {
  4742  				// No special processing needed for when we are caught up on restart.
  4743  				if ce == nil {
  4744  					recovering = false
  4745  					if n.NeedSnapshot() {
  4746  						doSnapshot(true)
  4747  					}
  4748  				} else if err := js.applyConsumerEntries(o, ce, isLeader); err == nil {
  4749  					ne, nb := n.Applied(ce.Index)
  4750  					ce.ReturnToPool()
  4751  					// If we have at least min entries to compact, go ahead and snapshot/compact.
  4752  					if nb > 0 && ne >= compactNumMin || nb > compactSizeMin {
  4753  						doSnapshot(false)
  4754  					}
  4755  				} else if err != errConsumerClosed {
  4756  					s.Warnf("Error applying consumer entries to '%s > %s'", ca.Client.serviceAccount(), ca.Name)
  4757  				}
  4758  			}
  4759  			aq.recycle(&ces)
  4760  		case isLeader = <-lch:
  4761  			if recovering && !isLeader {
  4762  				js.setConsumerAssignmentRecovering(ca)
  4763  			}
  4764  
  4765  			// Process the change.
  4766  			if err := js.processConsumerLeaderChange(o, isLeader); err == nil && isLeader {
  4767  				// Check our state if we are under an interest based stream.
  4768  				o.checkStateForInterestStream()
  4769  				// Do a snapshot.
  4770  				doSnapshot(true)
  4771  				// Synchronize followers to our state. Only send out if we have state.
  4772  				if n != nil {
  4773  					if _, _, applied := n.Progress(); applied > 0 {
  4774  						if snap, err := o.store.EncodedState(); err == nil {
  4775  							n.SendSnapshot(snap)
  4776  						}
  4777  					}
  4778  				}
  4779  			}
  4780  
  4781  			// We may receive a leader change after the consumer assignment which would cancel us
  4782  			// monitoring for this closely. So re-assess our state here as well.
  4783  			// Or the old leader is no longer part of the set and transferred leadership
  4784  			// for this leader to resume with removal
  4785  			rg := o.raftGroup()
  4786  
  4787  			// Check for migrations (peer count and replica count differ) here.
  4788  			// We set the state on the stream assignment update below.
  4789  			replicas, err := o.replica()
  4790  			if err != nil {
  4791  				continue
  4792  			}
  4793  			if isLeader && len(rg.Peers) != replicas {
  4794  				startMigrationMonitoring()
  4795  			} else {
  4796  				stopMigrationMonitoring()
  4797  			}
  4798  		case <-uch:
  4799  			// keep consumer assignment current
  4800  			ca = o.consumerAssignment()
  4801  			// We get this when we have a new consumer assignment caused by an update.
  4802  			// We want to know if we are migrating.
  4803  			rg := o.raftGroup()
  4804  			// keep peer list up to date with config
  4805  			js.checkPeers(rg)
  4806  			// If we are migrating, monitor for the new peers to be caught up.
  4807  			replicas, err := o.replica()
  4808  			if err != nil {
  4809  				continue
  4810  			}
  4811  			if isLeader && len(rg.Peers) != replicas {
  4812  				startMigrationMonitoring()
  4813  			} else {
  4814  				stopMigrationMonitoring()
  4815  			}
  4816  		case <-mmtc:
  4817  			if !isLeader {
  4818  				// We are no longer leader, so not our job.
  4819  				stopMigrationMonitoring()
  4820  				continue
  4821  			}
  4822  			rg := o.raftGroup()
  4823  			ci := js.clusterInfo(rg)
  4824  			replicas, err := o.replica()
  4825  			if err != nil {
  4826  				continue
  4827  			}
  4828  			if len(rg.Peers) <= replicas {
  4829  				// Migration no longer happening, so not our job anymore
  4830  				stopMigrationMonitoring()
  4831  				continue
  4832  			}
  4833  			newPeers, oldPeers, newPeerSet, _ := genPeerInfo(rg.Peers, len(rg.Peers)-replicas)
  4834  
  4835  			// If we are part of the new peerset and we have been passed the baton.
  4836  			// We will handle scale down.
  4837  			if newPeerSet[ourPeerId] {
  4838  				for _, p := range oldPeers {
  4839  					n.ProposeRemovePeer(p)
  4840  				}
  4841  				cca := ca.copyGroup()
  4842  				cca.Group.Peers = newPeers
  4843  				cca.Group.Cluster = s.cachedClusterName()
  4844  				cc.meta.ForwardProposal(encodeAddConsumerAssignment(cca))
  4845  				s.Noticef("Scaling down '%s > %s > %s' to %+v", ca.Client.serviceAccount(), ca.Stream, ca.Name, s.peerSetToNames(newPeers))
  4846  
  4847  			} else {
  4848  				var newLeaderPeer, newLeader, newCluster string
  4849  				neededCurrent, current := replicas/2+1, 0
  4850  				for _, r := range ci.Replicas {
  4851  					if r.Current && newPeerSet[r.Peer] {
  4852  						current++
  4853  						if newCluster == _EMPTY_ {
  4854  							newLeaderPeer, newLeader, newCluster = r.Peer, r.Name, r.cluster
  4855  						}
  4856  					}
  4857  				}
  4858  
  4859  				// Check if we have a quorom
  4860  				if current >= neededCurrent {
  4861  					s.Noticef("Transfer of consumer leader for '%s > %s > %s' to '%s'", ca.Client.serviceAccount(), ca.Stream, ca.Name, newLeader)
  4862  					n.StepDown(newLeaderPeer)
  4863  				}
  4864  			}
  4865  
  4866  		case <-t.C:
  4867  			doSnapshot(false)
  4868  		}
  4869  	}
  4870  }
  4871  
  4872  func (js *jetStream) applyConsumerEntries(o *consumer, ce *CommittedEntry, isLeader bool) error {
  4873  	for _, e := range ce.Entries {
  4874  		if e.Type == EntrySnapshot {
  4875  			if !isLeader {
  4876  				// No-op needed?
  4877  				state, err := decodeConsumerState(e.Data)
  4878  				if err != nil {
  4879  					if mset, node := o.streamAndNode(); mset != nil && node != nil {
  4880  						s := js.srv
  4881  						s.Errorf("JetStream cluster could not decode consumer snapshot for '%s > %s > %s' [%s]",
  4882  							mset.account(), mset.name(), o, node.Group())
  4883  					}
  4884  					panic(err.Error())
  4885  				}
  4886  				if err = o.store.Update(state); err != nil {
  4887  					o.mu.RLock()
  4888  					s, acc, mset, name := o.srv, o.acc, o.mset, o.name
  4889  					o.mu.RUnlock()
  4890  					if s != nil && mset != nil {
  4891  						s.Warnf("Consumer '%s > %s > %s' error on store update from snapshot entry: %v", acc, mset.name(), name, err)
  4892  					}
  4893  				}
  4894  				// Check our interest state if applicable.
  4895  				o.checkStateForInterestStream()
  4896  			}
  4897  
  4898  		} else if e.Type == EntryRemovePeer {
  4899  			js.mu.RLock()
  4900  			var ourID string
  4901  			if js.cluster != nil && js.cluster.meta != nil {
  4902  				ourID = js.cluster.meta.ID()
  4903  			}
  4904  			js.mu.RUnlock()
  4905  			if peer := string(e.Data); peer == ourID {
  4906  				shouldRemove := true
  4907  				if mset := o.getStream(); mset != nil {
  4908  					if sa := mset.streamAssignment(); sa != nil && sa.Group != nil {
  4909  						js.mu.RLock()
  4910  						shouldRemove = !sa.Group.isMember(ourID)
  4911  						js.mu.RUnlock()
  4912  					}
  4913  				}
  4914  				if shouldRemove {
  4915  					o.stopWithFlags(true, false, false, false)
  4916  				}
  4917  			}
  4918  			return nil
  4919  		} else if e.Type == EntryAddPeer {
  4920  			// Ignore for now.
  4921  		} else {
  4922  			buf := e.Data
  4923  			switch entryOp(buf[0]) {
  4924  			case updateDeliveredOp:
  4925  				// These are handled in place in leaders.
  4926  				if !isLeader {
  4927  					dseq, sseq, dc, ts, err := decodeDeliveredUpdate(buf[1:])
  4928  					if err != nil {
  4929  						if mset, node := o.streamAndNode(); mset != nil && node != nil {
  4930  							s := js.srv
  4931  							s.Errorf("JetStream cluster could not decode consumer delivered update for '%s > %s > %s' [%s]",
  4932  								mset.account(), mset.name(), o, node.Group())
  4933  						}
  4934  						panic(err.Error())
  4935  					}
  4936  					// Make sure to update delivered under the lock.
  4937  					o.mu.Lock()
  4938  					err = o.store.UpdateDelivered(dseq, sseq, dc, ts)
  4939  					o.ldt = time.Now()
  4940  					o.mu.Unlock()
  4941  					if err != nil {
  4942  						panic(err.Error())
  4943  					}
  4944  				}
  4945  			case updateAcksOp:
  4946  				dseq, sseq, err := decodeAckUpdate(buf[1:])
  4947  				if err != nil {
  4948  					if mset, node := o.streamAndNode(); mset != nil && node != nil {
  4949  						s := js.srv
  4950  						s.Errorf("JetStream cluster could not decode consumer ack update for '%s > %s > %s' [%s]",
  4951  							mset.account(), mset.name(), o, node.Group())
  4952  					}
  4953  					panic(err.Error())
  4954  				}
  4955  				if err := o.processReplicatedAck(dseq, sseq); err == errConsumerClosed {
  4956  					return err
  4957  				}
  4958  			case updateSkipOp:
  4959  				o.mu.Lock()
  4960  				if !o.isLeader() {
  4961  					var le = binary.LittleEndian
  4962  					if sseq := le.Uint64(buf[1:]); sseq > o.sseq {
  4963  						o.sseq = sseq
  4964  					}
  4965  				}
  4966  				o.mu.Unlock()
  4967  			case addPendingRequest:
  4968  				o.mu.Lock()
  4969  				if !o.isLeader() {
  4970  					if o.prm == nil {
  4971  						o.prm = make(map[string]struct{})
  4972  					}
  4973  					o.prm[string(buf[1:])] = struct{}{}
  4974  				}
  4975  				o.mu.Unlock()
  4976  			case removePendingRequest:
  4977  				o.mu.Lock()
  4978  				if !o.isLeader() {
  4979  					if o.prm != nil {
  4980  						delete(o.prm, string(buf[1:]))
  4981  					}
  4982  				}
  4983  				o.mu.Unlock()
  4984  			default:
  4985  				panic(fmt.Sprintf("JetStream Cluster Unknown group entry op type: %v", entryOp(buf[0])))
  4986  			}
  4987  		}
  4988  	}
  4989  	return nil
  4990  }
  4991  
  4992  var errConsumerClosed = errors.New("consumer closed")
  4993  
  4994  func (o *consumer) processReplicatedAck(dseq, sseq uint64) error {
  4995  	o.mu.Lock()
  4996  	mset := o.mset
  4997  	if o.closed || mset == nil {
  4998  		o.mu.Unlock()
  4999  		return errConsumerClosed
  5000  	}
  5001  	if mset.closed.Load() {
  5002  		o.mu.Unlock()
  5003  		return errStreamClosed
  5004  	}
  5005  
  5006  	// Update activity.
  5007  	o.lat = time.Now()
  5008  
  5009  	// Do actual ack update to store.
  5010  	o.store.UpdateAcks(dseq, sseq)
  5011  
  5012  	if o.retention == LimitsPolicy {
  5013  		o.mu.Unlock()
  5014  		return nil
  5015  	}
  5016  
  5017  	var sagap uint64
  5018  	if o.cfg.AckPolicy == AckAll {
  5019  		if o.isLeader() {
  5020  			sagap = sseq - o.asflr
  5021  		} else {
  5022  			// We are a follower so only have the store state, so read that in.
  5023  			state, err := o.store.State()
  5024  			if err != nil {
  5025  				o.mu.Unlock()
  5026  				return err
  5027  			}
  5028  			sagap = sseq - state.AckFloor.Stream
  5029  		}
  5030  	}
  5031  	o.mu.Unlock()
  5032  
  5033  	if sagap > 1 {
  5034  		// FIXME(dlc) - This is very inefficient, will need to fix.
  5035  		for seq := sseq; seq > sseq-sagap; seq-- {
  5036  			mset.ackMsg(o, seq)
  5037  		}
  5038  	} else {
  5039  		mset.ackMsg(o, sseq)
  5040  	}
  5041  	return nil
  5042  }
  5043  
  5044  var errBadAckUpdate = errors.New("jetstream cluster bad replicated ack update")
  5045  var errBadDeliveredUpdate = errors.New("jetstream cluster bad replicated delivered update")
  5046  
  5047  func decodeAckUpdate(buf []byte) (dseq, sseq uint64, err error) {
  5048  	var bi, n int
  5049  	if dseq, n = binary.Uvarint(buf); n < 0 {
  5050  		return 0, 0, errBadAckUpdate
  5051  	}
  5052  	bi += n
  5053  	if sseq, n = binary.Uvarint(buf[bi:]); n < 0 {
  5054  		return 0, 0, errBadAckUpdate
  5055  	}
  5056  	return dseq, sseq, nil
  5057  }
  5058  
  5059  func decodeDeliveredUpdate(buf []byte) (dseq, sseq, dc uint64, ts int64, err error) {
  5060  	var bi, n int
  5061  	if dseq, n = binary.Uvarint(buf); n < 0 {
  5062  		return 0, 0, 0, 0, errBadDeliveredUpdate
  5063  	}
  5064  	bi += n
  5065  	if sseq, n = binary.Uvarint(buf[bi:]); n < 0 {
  5066  		return 0, 0, 0, 0, errBadDeliveredUpdate
  5067  	}
  5068  	bi += n
  5069  	if dc, n = binary.Uvarint(buf[bi:]); n < 0 {
  5070  		return 0, 0, 0, 0, errBadDeliveredUpdate
  5071  	}
  5072  	bi += n
  5073  	if ts, n = binary.Varint(buf[bi:]); n < 0 {
  5074  		return 0, 0, 0, 0, errBadDeliveredUpdate
  5075  	}
  5076  	return dseq, sseq, dc, ts, nil
  5077  }
  5078  
  5079  func (js *jetStream) processConsumerLeaderChange(o *consumer, isLeader bool) error {
  5080  	stepDownIfLeader := func() error {
  5081  		if node := o.raftNode(); node != nil && isLeader {
  5082  			node.StepDown()
  5083  		}
  5084  		return errors.New("failed to update consumer leader status")
  5085  	}
  5086  
  5087  	if o == nil || o.isClosed() {
  5088  		return stepDownIfLeader()
  5089  	}
  5090  
  5091  	ca := o.consumerAssignment()
  5092  	if ca == nil {
  5093  		return stepDownIfLeader()
  5094  	}
  5095  	js.mu.Lock()
  5096  	s, account, err := js.srv, ca.Client.serviceAccount(), ca.err
  5097  	client, subject, reply, streamName, consumerName := ca.Client, ca.Subject, ca.Reply, ca.Stream, ca.Name
  5098  	hasResponded := ca.responded
  5099  	ca.responded = true
  5100  	js.mu.Unlock()
  5101  
  5102  	acc, _ := s.LookupAccount(account)
  5103  	if acc == nil {
  5104  		return stepDownIfLeader()
  5105  	}
  5106  
  5107  	if isLeader {
  5108  		s.Noticef("JetStream cluster new consumer leader for '%s > %s > %s'", ca.Client.serviceAccount(), streamName, consumerName)
  5109  		s.sendConsumerLeaderElectAdvisory(o)
  5110  		// Check for peer removal and process here if needed.
  5111  		js.checkPeers(ca.Group)
  5112  	} else {
  5113  		// We are stepping down.
  5114  		// Make sure if we are doing so because we have lost quorum that we send the appropriate advisories.
  5115  		if node := o.raftNode(); node != nil && !node.Quorum() && time.Since(node.Created()) > 5*time.Second {
  5116  			s.sendConsumerLostQuorumAdvisory(o)
  5117  		}
  5118  	}
  5119  
  5120  	// Tell consumer to switch leader status.
  5121  	o.setLeader(isLeader)
  5122  
  5123  	if !isLeader || hasResponded {
  5124  		if isLeader {
  5125  			o.clearInitialInfo()
  5126  		}
  5127  		return nil
  5128  	}
  5129  
  5130  	var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}
  5131  	if err != nil {
  5132  		resp.Error = NewJSConsumerCreateError(err, Unless(err))
  5133  		s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  5134  	} else {
  5135  		resp.ConsumerInfo = o.initialInfo()
  5136  		s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
  5137  		if node := o.raftNode(); node != nil {
  5138  			o.sendCreateAdvisory()
  5139  		}
  5140  	}
  5141  
  5142  	// Only send a pause advisory on consumer create if we're
  5143  	// actually paused. The timer would have been kicked by now
  5144  	// by the call to o.setLeader() above.
  5145  	if isLeader && o.cfg.PauseUntil != nil && !o.cfg.PauseUntil.IsZero() && time.Now().Before(*o.cfg.PauseUntil) {
  5146  		o.sendPauseAdvisoryLocked(&o.cfg)
  5147  	}
  5148  
  5149  	return nil
  5150  }
  5151  
  5152  // Determines if we should send lost quorum advisory. We throttle these after first one.
  5153  func (o *consumer) shouldSendLostQuorum() bool {
  5154  	o.mu.Lock()
  5155  	defer o.mu.Unlock()
  5156  	if time.Since(o.lqsent) >= lostQuorumAdvInterval {
  5157  		o.lqsent = time.Now()
  5158  		return true
  5159  	}
  5160  	return false
  5161  }
  5162  
  5163  func (s *Server) sendConsumerLostQuorumAdvisory(o *consumer) {
  5164  	if o == nil {
  5165  		return
  5166  	}
  5167  	node, stream, consumer, acc := o.raftNode(), o.streamName(), o.String(), o.account()
  5168  	if node == nil {
  5169  		return
  5170  	}
  5171  	if !o.shouldSendLostQuorum() {
  5172  		return
  5173  	}
  5174  
  5175  	s.Warnf("JetStream cluster consumer '%s > %s > %s' has NO quorum, stalled.", acc.GetName(), stream, consumer)
  5176  
  5177  	subj := JSAdvisoryConsumerQuorumLostPre + "." + stream + "." + consumer
  5178  	adv := &JSConsumerQuorumLostAdvisory{
  5179  		TypedEvent: TypedEvent{
  5180  			Type: JSConsumerQuorumLostAdvisoryType,
  5181  			ID:   nuid.Next(),
  5182  			Time: time.Now().UTC(),
  5183  		},
  5184  		Stream:   stream,
  5185  		Consumer: consumer,
  5186  		Replicas: s.replicas(node),
  5187  		Domain:   s.getOpts().JetStreamDomain,
  5188  	}
  5189  
  5190  	// Send to the user's account if not the system account.
  5191  	if acc != s.SystemAccount() {
  5192  		s.publishAdvisory(acc, subj, adv)
  5193  	}
  5194  	// Now do system level one. Place account info in adv, and nil account means system.
  5195  	adv.Account = acc.GetName()
  5196  	s.publishAdvisory(nil, subj, adv)
  5197  }
  5198  
  5199  func (s *Server) sendConsumerLeaderElectAdvisory(o *consumer) {
  5200  	if o == nil {
  5201  		return
  5202  	}
  5203  	node, stream, consumer, acc := o.raftNode(), o.streamName(), o.String(), o.account()
  5204  	if node == nil {
  5205  		return
  5206  	}
  5207  
  5208  	subj := JSAdvisoryConsumerLeaderElectedPre + "." + stream + "." + consumer
  5209  	adv := &JSConsumerLeaderElectedAdvisory{
  5210  		TypedEvent: TypedEvent{
  5211  			Type: JSConsumerLeaderElectedAdvisoryType,
  5212  			ID:   nuid.Next(),
  5213  			Time: time.Now().UTC(),
  5214  		},
  5215  		Stream:   stream,
  5216  		Consumer: consumer,
  5217  		Leader:   s.serverNameForNode(node.GroupLeader()),
  5218  		Replicas: s.replicas(node),
  5219  		Domain:   s.getOpts().JetStreamDomain,
  5220  	}
  5221  
  5222  	// Send to the user's account if not the system account.
  5223  	if acc != s.SystemAccount() {
  5224  		s.publishAdvisory(acc, subj, adv)
  5225  	}
  5226  	// Now do system level one. Place account info in adv, and nil account means system.
  5227  	adv.Account = acc.GetName()
  5228  	s.publishAdvisory(nil, subj, adv)
  5229  }
  5230  
  5231  type streamAssignmentResult struct {
  5232  	Account  string                      `json:"account"`
  5233  	Stream   string                      `json:"stream"`
  5234  	Response *JSApiStreamCreateResponse  `json:"create_response,omitempty"`
  5235  	Restore  *JSApiStreamRestoreResponse `json:"restore_response,omitempty"`
  5236  	Update   bool                        `json:"is_update,omitempty"`
  5237  }
  5238  
  5239  // Determine if this is an insufficient resources' error type.
  5240  func isInsufficientResourcesErr(resp *JSApiStreamCreateResponse) bool {
  5241  	return resp != nil && resp.Error != nil && IsNatsErr(resp.Error, JSInsufficientResourcesErr, JSMemoryResourcesExceededErr, JSStorageResourcesExceededErr)
  5242  }
  5243  
  5244  // Process error results of stream and consumer assignments.
  5245  // Success will be handled by stream leader.
  5246  func (js *jetStream) processStreamAssignmentResults(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
  5247  	var result streamAssignmentResult
  5248  	if err := json.Unmarshal(msg, &result); err != nil {
  5249  		// TODO(dlc) - log
  5250  		return
  5251  	}
  5252  	acc, _ := js.srv.LookupAccount(result.Account)
  5253  	if acc == nil {
  5254  		// TODO(dlc) - log
  5255  		return
  5256  	}
  5257  
  5258  	js.mu.Lock()
  5259  	defer js.mu.Unlock()
  5260  
  5261  	s, cc := js.srv, js.cluster
  5262  	if cc == nil || cc.meta == nil {
  5263  		return
  5264  	}
  5265  
  5266  	// This should have been done already in processStreamAssignment, but in
  5267  	// case we have a code path that gets here with no processStreamAssignment,
  5268  	// then we will do the proper thing. Otherwise will be a no-op.
  5269  	cc.removeInflightProposal(result.Account, result.Stream)
  5270  
  5271  	// FIXME(dlc) - suppress duplicates?
  5272  	if sa := js.streamAssignment(result.Account, result.Stream); sa != nil {
  5273  		canDelete := !result.Update && time.Since(sa.Created) < 5*time.Second
  5274  
  5275  		// See if we should retry in case this cluster is full but there are others.
  5276  		if cfg, ci := sa.Config, sa.Client; cfg != nil && ci != nil && isInsufficientResourcesErr(result.Response) && canDelete {
  5277  			// If cluster is defined we can not retry.
  5278  			if cfg.Placement == nil || cfg.Placement.Cluster == _EMPTY_ {
  5279  				// If we have additional clusters to try we can retry.
  5280  				// We have already verified that ci != nil.
  5281  				if len(ci.Alternates) > 0 {
  5282  					if rg, err := js.createGroupForStream(ci, cfg); err != nil {
  5283  						s.Warnf("Retrying cluster placement for stream '%s > %s' failed due to placement error: %+v", result.Account, result.Stream, err)
  5284  					} else {
  5285  						if org := sa.Group; org != nil && len(org.Peers) > 0 {
  5286  							s.Warnf("Retrying cluster placement for stream '%s > %s' due to insufficient resources in cluster %q",
  5287  								result.Account, result.Stream, s.clusterNameForNode(org.Peers[0]))
  5288  						} else {
  5289  							s.Warnf("Retrying cluster placement for stream '%s > %s' due to insufficient resources", result.Account, result.Stream)
  5290  						}
  5291  						// Pick a new preferred leader.
  5292  						rg.setPreferred()
  5293  						// Get rid of previous attempt.
  5294  						cc.meta.Propose(encodeDeleteStreamAssignment(sa))
  5295  						// Propose new.
  5296  						sa.Group, sa.err = rg, nil
  5297  						cc.meta.Propose(encodeAddStreamAssignment(sa))
  5298  						return
  5299  					}
  5300  				}
  5301  			}
  5302  		}
  5303  
  5304  		// Respond to the user here.
  5305  		var resp string
  5306  		if result.Response != nil {
  5307  			resp = s.jsonResponse(result.Response)
  5308  		} else if result.Restore != nil {
  5309  			resp = s.jsonResponse(result.Restore)
  5310  		}
  5311  		if !sa.responded || result.Update {
  5312  			sa.responded = true
  5313  			js.srv.sendAPIErrResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, resp)
  5314  		}
  5315  		// Remove this assignment if possible.
  5316  		if canDelete {
  5317  			sa.err = NewJSClusterNotAssignedError()
  5318  			cc.meta.Propose(encodeDeleteStreamAssignment(sa))
  5319  		}
  5320  	}
  5321  }
  5322  
  5323  func (js *jetStream) processConsumerAssignmentResults(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
  5324  	var result consumerAssignmentResult
  5325  	if err := json.Unmarshal(msg, &result); err != nil {
  5326  		// TODO(dlc) - log
  5327  		return
  5328  	}
  5329  	acc, _ := js.srv.LookupAccount(result.Account)
  5330  	if acc == nil {
  5331  		// TODO(dlc) - log
  5332  		return
  5333  	}
  5334  
  5335  	js.mu.Lock()
  5336  	defer js.mu.Unlock()
  5337  
  5338  	s, cc := js.srv, js.cluster
  5339  	if cc == nil || cc.meta == nil {
  5340  		return
  5341  	}
  5342  
  5343  	if sa := js.streamAssignment(result.Account, result.Stream); sa != nil && sa.consumers != nil {
  5344  		if ca := sa.consumers[result.Consumer]; ca != nil && !ca.responded {
  5345  			js.srv.sendAPIErrResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(result.Response))
  5346  			ca.responded = true
  5347  
  5348  			// Check if this failed.
  5349  			// TODO(dlc) - Could have mixed results, should track per peer.
  5350  			// Make sure this is recent response, do not delete existing consumers.
  5351  			if result.Response.Error != nil && result.Response.Error != NewJSConsumerNameExistError() && time.Since(ca.Created) < 2*time.Second {
  5352  				// So while we are deleting we will not respond to list/names requests.
  5353  				ca.err = NewJSClusterNotAssignedError()
  5354  				cc.meta.Propose(encodeDeleteConsumerAssignment(ca))
  5355  				s.Warnf("Proposing to delete consumer `%s > %s > %s' due to assignment response error: %v",
  5356  					result.Account, result.Stream, result.Consumer, result.Response.Error)
  5357  			}
  5358  		}
  5359  	}
  5360  }
  5361  
  5362  const (
  5363  	streamAssignmentSubj   = "$SYS.JSC.STREAM.ASSIGNMENT.RESULT"
  5364  	consumerAssignmentSubj = "$SYS.JSC.CONSUMER.ASSIGNMENT.RESULT"
  5365  )
  5366  
  5367  // Lock should be held.
  5368  func (js *jetStream) startUpdatesSub() {
  5369  	cc, s, c := js.cluster, js.srv, js.cluster.c
  5370  	if cc.streamResults == nil {
  5371  		cc.streamResults, _ = s.systemSubscribe(streamAssignmentSubj, _EMPTY_, false, c, js.processStreamAssignmentResults)
  5372  	}
  5373  	if cc.consumerResults == nil {
  5374  		cc.consumerResults, _ = s.systemSubscribe(consumerAssignmentSubj, _EMPTY_, false, c, js.processConsumerAssignmentResults)
  5375  	}
  5376  	if cc.stepdown == nil {
  5377  		cc.stepdown, _ = s.systemSubscribe(JSApiLeaderStepDown, _EMPTY_, false, c, s.jsLeaderStepDownRequest)
  5378  	}
  5379  	if cc.peerRemove == nil {
  5380  		cc.peerRemove, _ = s.systemSubscribe(JSApiRemoveServer, _EMPTY_, false, c, s.jsLeaderServerRemoveRequest)
  5381  	}
  5382  	if cc.peerStreamMove == nil {
  5383  		cc.peerStreamMove, _ = s.systemSubscribe(JSApiServerStreamMove, _EMPTY_, false, c, s.jsLeaderServerStreamMoveRequest)
  5384  	}
  5385  	if cc.peerStreamCancelMove == nil {
  5386  		cc.peerStreamCancelMove, _ = s.systemSubscribe(JSApiServerStreamCancelMove, _EMPTY_, false, c, s.jsLeaderServerStreamCancelMoveRequest)
  5387  	}
  5388  	if js.accountPurge == nil {
  5389  		js.accountPurge, _ = s.systemSubscribe(JSApiAccountPurge, _EMPTY_, false, c, s.jsLeaderAccountPurgeRequest)
  5390  	}
  5391  }
  5392  
  5393  // Lock should be held.
  5394  func (js *jetStream) stopUpdatesSub() {
  5395  	cc := js.cluster
  5396  	if cc.streamResults != nil {
  5397  		cc.s.sysUnsubscribe(cc.streamResults)
  5398  		cc.streamResults = nil
  5399  	}
  5400  	if cc.consumerResults != nil {
  5401  		cc.s.sysUnsubscribe(cc.consumerResults)
  5402  		cc.consumerResults = nil
  5403  	}
  5404  	if cc.stepdown != nil {
  5405  		cc.s.sysUnsubscribe(cc.stepdown)
  5406  		cc.stepdown = nil
  5407  	}
  5408  	if cc.peerRemove != nil {
  5409  		cc.s.sysUnsubscribe(cc.peerRemove)
  5410  		cc.peerRemove = nil
  5411  	}
  5412  	if cc.peerStreamMove != nil {
  5413  		cc.s.sysUnsubscribe(cc.peerStreamMove)
  5414  		cc.peerStreamMove = nil
  5415  	}
  5416  	if cc.peerStreamCancelMove != nil {
  5417  		cc.s.sysUnsubscribe(cc.peerStreamCancelMove)
  5418  		cc.peerStreamCancelMove = nil
  5419  	}
  5420  	if js.accountPurge != nil {
  5421  		cc.s.sysUnsubscribe(js.accountPurge)
  5422  		js.accountPurge = nil
  5423  	}
  5424  }
  5425  
  5426  func (s *Server) sendDomainLeaderElectAdvisory() {
  5427  	js, cc := s.getJetStreamCluster()
  5428  	if js == nil || cc == nil {
  5429  		return
  5430  	}
  5431  
  5432  	js.mu.RLock()
  5433  	node := cc.meta
  5434  	js.mu.RUnlock()
  5435  
  5436  	adv := &JSDomainLeaderElectedAdvisory{
  5437  		TypedEvent: TypedEvent{
  5438  			Type: JSDomainLeaderElectedAdvisoryType,
  5439  			ID:   nuid.Next(),
  5440  			Time: time.Now().UTC(),
  5441  		},
  5442  		Leader:   node.GroupLeader(),
  5443  		Replicas: s.replicas(node),
  5444  		Cluster:  s.cachedClusterName(),
  5445  		Domain:   s.getOpts().JetStreamDomain,
  5446  	}
  5447  
  5448  	s.publishAdvisory(nil, JSAdvisoryDomainLeaderElected, adv)
  5449  }
  5450  
  5451  func (js *jetStream) processLeaderChange(isLeader bool) {
  5452  	if js == nil {
  5453  		return
  5454  	}
  5455  	s := js.srv
  5456  	if s == nil {
  5457  		return
  5458  	}
  5459  	// Update our server atomic.
  5460  	s.isMetaLeader.Store(isLeader)
  5461  
  5462  	if isLeader {
  5463  		s.Noticef("Self is new JetStream cluster metadata leader")
  5464  		s.sendDomainLeaderElectAdvisory()
  5465  	} else {
  5466  		var node string
  5467  		if meta := js.getMetaGroup(); meta != nil {
  5468  			node = meta.GroupLeader()
  5469  		}
  5470  		if node == _EMPTY_ {
  5471  			s.Noticef("JetStream cluster no metadata leader")
  5472  		} else if srv := js.srv.serverNameForNode(node); srv == _EMPTY_ {
  5473  			s.Noticef("JetStream cluster new remote metadata leader")
  5474  		} else if clst := js.srv.clusterNameForNode(node); clst == _EMPTY_ {
  5475  			s.Noticef("JetStream cluster new metadata leader: %s", srv)
  5476  		} else {
  5477  			s.Noticef("JetStream cluster new metadata leader: %s/%s", srv, clst)
  5478  		}
  5479  	}
  5480  
  5481  	js.mu.Lock()
  5482  	defer js.mu.Unlock()
  5483  
  5484  	if isLeader {
  5485  		js.startUpdatesSub()
  5486  	} else {
  5487  		js.stopUpdatesSub()
  5488  		// TODO(dlc) - stepdown.
  5489  	}
  5490  
  5491  	// If we have been signaled to check the streams, this is for a bug that left stream
  5492  	// assignments with no sync subject after an update and no way to sync/catchup outside of the RAFT layer.
  5493  	if isLeader && js.cluster.streamsCheck {
  5494  		cc := js.cluster
  5495  		for acc, asa := range cc.streams {
  5496  			for _, sa := range asa {
  5497  				if sa.Sync == _EMPTY_ {
  5498  					s.Warnf("Stream assigment corrupt for stream '%s > %s'", acc, sa.Config.Name)
  5499  					nsa := &streamAssignment{Group: sa.Group, Config: sa.Config, Subject: sa.Subject, Reply: sa.Reply, Client: sa.Client}
  5500  					nsa.Sync = syncSubjForStream()
  5501  					cc.meta.Propose(encodeUpdateStreamAssignment(nsa))
  5502  				}
  5503  			}
  5504  		}
  5505  		// Clear check.
  5506  		cc.streamsCheck = false
  5507  	}
  5508  }
  5509  
  5510  // Lock should be held.
  5511  func (cc *jetStreamCluster) remapStreamAssignment(sa *streamAssignment, removePeer string) bool {
  5512  	// Invoke placement algo passing RG peers that stay (existing) and the peer that is being removed (ignore)
  5513  	var retain, ignore []string
  5514  	for _, v := range sa.Group.Peers {
  5515  		if v == removePeer {
  5516  			ignore = append(ignore, v)
  5517  		} else {
  5518  			retain = append(retain, v)
  5519  		}
  5520  	}
  5521  
  5522  	newPeers, placementError := cc.selectPeerGroup(len(sa.Group.Peers), sa.Group.Cluster, sa.Config, retain, 0, ignore)
  5523  
  5524  	if placementError == nil {
  5525  		sa.Group.Peers = newPeers
  5526  		// Don't influence preferred leader.
  5527  		sa.Group.Preferred = _EMPTY_
  5528  		return true
  5529  	}
  5530  
  5531  	// If R1 just return to avoid bricking the stream.
  5532  	if sa.Group.node == nil || len(sa.Group.Peers) == 1 {
  5533  		return false
  5534  	}
  5535  
  5536  	// If we are here let's remove the peer at least, as long as we are R>1
  5537  	for i, peer := range sa.Group.Peers {
  5538  		if peer == removePeer {
  5539  			sa.Group.Peers[i] = sa.Group.Peers[len(sa.Group.Peers)-1]
  5540  			sa.Group.Peers = sa.Group.Peers[:len(sa.Group.Peers)-1]
  5541  			break
  5542  		}
  5543  	}
  5544  	return false
  5545  }
  5546  
  5547  type selectPeerError struct {
  5548  	excludeTag  bool
  5549  	offline     bool
  5550  	noStorage   bool
  5551  	uniqueTag   bool
  5552  	misc        bool
  5553  	noJsClust   bool
  5554  	noMatchTags map[string]struct{}
  5555  }
  5556  
  5557  func (e *selectPeerError) Error() string {
  5558  	b := strings.Builder{}
  5559  	writeBoolErrReason := func(hasErr bool, errMsg string) {
  5560  		if !hasErr {
  5561  			return
  5562  		}
  5563  		b.WriteString(", ")
  5564  		b.WriteString(errMsg)
  5565  	}
  5566  	b.WriteString("no suitable peers for placement")
  5567  	writeBoolErrReason(e.offline, "peer offline")
  5568  	writeBoolErrReason(e.excludeTag, "exclude tag set")
  5569  	writeBoolErrReason(e.noStorage, "insufficient storage")
  5570  	writeBoolErrReason(e.uniqueTag, "server tag not unique")
  5571  	writeBoolErrReason(e.misc, "miscellaneous issue")
  5572  	writeBoolErrReason(e.noJsClust, "jetstream not enabled in cluster")
  5573  	if len(e.noMatchTags) != 0 {
  5574  		b.WriteString(", tags not matched [")
  5575  		var firstTagWritten bool
  5576  		for tag := range e.noMatchTags {
  5577  			if firstTagWritten {
  5578  				b.WriteString(", ")
  5579  			}
  5580  			firstTagWritten = true
  5581  			b.WriteRune('\'')
  5582  			b.WriteString(tag)
  5583  			b.WriteRune('\'')
  5584  		}
  5585  		b.WriteString("]")
  5586  	}
  5587  	return b.String()
  5588  }
  5589  
  5590  func (e *selectPeerError) addMissingTag(t string) {
  5591  	if e.noMatchTags == nil {
  5592  		e.noMatchTags = map[string]struct{}{}
  5593  	}
  5594  	e.noMatchTags[t] = struct{}{}
  5595  }
  5596  
  5597  func (e *selectPeerError) accumulate(eAdd *selectPeerError) {
  5598  	if eAdd == nil {
  5599  		return
  5600  	}
  5601  	acc := func(val *bool, valAdd bool) {
  5602  		if valAdd {
  5603  			*val = valAdd
  5604  		}
  5605  	}
  5606  	acc(&e.offline, eAdd.offline)
  5607  	acc(&e.excludeTag, eAdd.excludeTag)
  5608  	acc(&e.noStorage, eAdd.noStorage)
  5609  	acc(&e.uniqueTag, eAdd.uniqueTag)
  5610  	acc(&e.misc, eAdd.misc)
  5611  	acc(&e.noJsClust, eAdd.noJsClust)
  5612  	for tag := range eAdd.noMatchTags {
  5613  		e.addMissingTag(tag)
  5614  	}
  5615  }
  5616  
  5617  // selectPeerGroup will select a group of peers to start a raft group.
  5618  // when peers exist already the unique tag prefix check for the replaceFirstExisting will be skipped
  5619  // js lock should be held.
  5620  func (cc *jetStreamCluster) selectPeerGroup(r int, cluster string, cfg *StreamConfig, existing []string, replaceFirstExisting int, ignore []string) ([]string, *selectPeerError) {
  5621  	if cluster == _EMPTY_ || cfg == nil {
  5622  		return nil, &selectPeerError{misc: true}
  5623  	}
  5624  
  5625  	var maxBytes uint64
  5626  	if cfg.MaxBytes > 0 {
  5627  		maxBytes = uint64(cfg.MaxBytes)
  5628  	}
  5629  
  5630  	// Check for tags.
  5631  	var tags []string
  5632  	if cfg.Placement != nil && len(cfg.Placement.Tags) > 0 {
  5633  		tags = cfg.Placement.Tags
  5634  	}
  5635  
  5636  	// Used for weighted sorting based on availability.
  5637  	type wn struct {
  5638  		id    string
  5639  		avail uint64
  5640  		ha    int
  5641  		ns    int
  5642  	}
  5643  
  5644  	var nodes []wn
  5645  	// peers is a randomized list
  5646  	s, peers := cc.s, cc.meta.Peers()
  5647  
  5648  	uniqueTagPrefix := s.getOpts().JetStreamUniqueTag
  5649  	if uniqueTagPrefix != _EMPTY_ {
  5650  		for _, tag := range tags {
  5651  			if strings.HasPrefix(tag, uniqueTagPrefix) {
  5652  				// disable uniqueness check if explicitly listed in tags
  5653  				uniqueTagPrefix = _EMPTY_
  5654  				break
  5655  			}
  5656  		}
  5657  	}
  5658  	var uniqueTags = make(map[string]*nodeInfo)
  5659  
  5660  	checkUniqueTag := func(ni *nodeInfo) (bool, *nodeInfo) {
  5661  		for _, t := range ni.tags {
  5662  			if strings.HasPrefix(t, uniqueTagPrefix) {
  5663  				if n, ok := uniqueTags[t]; !ok {
  5664  					uniqueTags[t] = ni
  5665  					return true, ni
  5666  				} else {
  5667  					return false, n
  5668  				}
  5669  			}
  5670  		}
  5671  		// default requires the unique prefix to be present
  5672  		return false, nil
  5673  	}
  5674  
  5675  	// Map existing.
  5676  	var ep map[string]struct{}
  5677  	if le := len(existing); le > 0 {
  5678  		if le >= r {
  5679  			return existing[:r], nil
  5680  		}
  5681  		ep = make(map[string]struct{})
  5682  		for i, p := range existing {
  5683  			ep[p] = struct{}{}
  5684  			if uniqueTagPrefix == _EMPTY_ {
  5685  				continue
  5686  			}
  5687  			si, ok := s.nodeToInfo.Load(p)
  5688  			if !ok || si == nil || i < replaceFirstExisting {
  5689  				continue
  5690  			}
  5691  			ni := si.(nodeInfo)
  5692  			// collect unique tags, but do not require them as this node is already part of the peerset
  5693  			checkUniqueTag(&ni)
  5694  		}
  5695  	}
  5696  
  5697  	// Map ignore
  5698  	var ip map[string]struct{}
  5699  	if li := len(ignore); li > 0 {
  5700  		ip = make(map[string]struct{})
  5701  		for _, p := range ignore {
  5702  			ip[p] = struct{}{}
  5703  		}
  5704  	}
  5705  
  5706  	// Grab the number of streams and HA assets currently assigned to each peer.
  5707  	// HAAssets under usage is async, so calculate here in realtime based on assignments.
  5708  	peerStreams := make(map[string]int, len(peers))
  5709  	peerHA := make(map[string]int, len(peers))
  5710  	for _, asa := range cc.streams {
  5711  		for _, sa := range asa {
  5712  			isHA := len(sa.Group.Peers) > 1
  5713  			for _, peer := range sa.Group.Peers {
  5714  				peerStreams[peer]++
  5715  				if isHA {
  5716  					peerHA[peer]++
  5717  				}
  5718  			}
  5719  		}
  5720  	}
  5721  
  5722  	maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets
  5723  
  5724  	// An error is a result of multiple individual placement decisions.
  5725  	// Which is why we keep taps on how often which one happened.
  5726  	err := selectPeerError{}
  5727  
  5728  	// Shuffle them up.
  5729  	rand.Shuffle(len(peers), func(i, j int) { peers[i], peers[j] = peers[j], peers[i] })
  5730  	for _, p := range peers {
  5731  		si, ok := s.nodeToInfo.Load(p.ID)
  5732  		if !ok || si == nil {
  5733  			err.misc = true
  5734  			continue
  5735  		}
  5736  		ni := si.(nodeInfo)
  5737  		// Only select from the designated named cluster.
  5738  		if ni.cluster != cluster {
  5739  			s.Debugf("Peer selection: discard %s@%s reason: not target cluster %s", ni.name, ni.cluster, cluster)
  5740  			continue
  5741  		}
  5742  
  5743  		// If we know its offline or we do not have config or err don't consider.
  5744  		if ni.offline || ni.cfg == nil || ni.stats == nil {
  5745  			s.Debugf("Peer selection: discard %s@%s reason: offline", ni.name, ni.cluster)
  5746  			err.offline = true
  5747  			continue
  5748  		}
  5749  
  5750  		// If ignore skip
  5751  		if _, ok := ip[p.ID]; ok {
  5752  			continue
  5753  		}
  5754  
  5755  		// If existing also skip, we will add back in to front of the list when done.
  5756  		if _, ok := ep[p.ID]; ok {
  5757  			continue
  5758  		}
  5759  
  5760  		if ni.tags.Contains(jsExcludePlacement) {
  5761  			s.Debugf("Peer selection: discard %s@%s tags: %v reason: %s present",
  5762  				ni.name, ni.cluster, ni.tags, jsExcludePlacement)
  5763  			err.excludeTag = true
  5764  			continue
  5765  		}
  5766  
  5767  		if len(tags) > 0 {
  5768  			matched := true
  5769  			for _, t := range tags {
  5770  				if !ni.tags.Contains(t) {
  5771  					matched = false
  5772  					s.Debugf("Peer selection: discard %s@%s tags: %v reason: mandatory tag %s not present",
  5773  						ni.name, ni.cluster, ni.tags, t)
  5774  					err.addMissingTag(t)
  5775  					break
  5776  				}
  5777  			}
  5778  			if !matched {
  5779  				continue
  5780  			}
  5781  		}
  5782  
  5783  		var available uint64
  5784  		if ni.stats != nil {
  5785  			switch cfg.Storage {
  5786  			case MemoryStorage:
  5787  				used := ni.stats.ReservedMemory
  5788  				if ni.stats.Memory > used {
  5789  					used = ni.stats.Memory
  5790  				}
  5791  				if ni.cfg.MaxMemory > int64(used) {
  5792  					available = uint64(ni.cfg.MaxMemory) - used
  5793  				}
  5794  			case FileStorage:
  5795  				used := ni.stats.ReservedStore
  5796  				if ni.stats.Store > used {
  5797  					used = ni.stats.Store
  5798  				}
  5799  				if ni.cfg.MaxStore > int64(used) {
  5800  					available = uint64(ni.cfg.MaxStore) - used
  5801  				}
  5802  			}
  5803  		}
  5804  
  5805  		// Otherwise check if we have enough room if maxBytes set.
  5806  		if maxBytes > 0 && maxBytes > available {
  5807  			s.Warnf("Peer selection: discard %s@%s (Max Bytes: %d) exceeds available %s storage of %d bytes",
  5808  				ni.name, ni.cluster, maxBytes, cfg.Storage.String(), available)
  5809  			err.noStorage = true
  5810  			continue
  5811  		}
  5812  		// HAAssets contain _meta_ which we want to ignore, hence > and not >=.
  5813  		if maxHaAssets > 0 && ni.stats != nil && ni.stats.HAAssets > maxHaAssets {
  5814  			s.Warnf("Peer selection: discard %s@%s (HA Asset Count: %d) exceeds max ha asset limit of %d for stream placement",
  5815  				ni.name, ni.cluster, ni.stats.HAAssets, maxHaAssets)
  5816  			err.misc = true
  5817  			continue
  5818  		}
  5819  
  5820  		if uniqueTagPrefix != _EMPTY_ {
  5821  			if unique, owner := checkUniqueTag(&ni); !unique {
  5822  				if owner != nil {
  5823  					s.Debugf("Peer selection: discard %s@%s tags:%v reason: unique prefix %s owned by %s@%s",
  5824  						ni.name, ni.cluster, ni.tags, owner.name, owner.cluster)
  5825  				} else {
  5826  					s.Debugf("Peer selection: discard %s@%s tags:%v reason: unique prefix %s not present",
  5827  						ni.name, ni.cluster, ni.tags)
  5828  				}
  5829  				err.uniqueTag = true
  5830  				continue
  5831  			}
  5832  		}
  5833  		// Add to our list of potential nodes.
  5834  		nodes = append(nodes, wn{p.ID, available, peerHA[p.ID], peerStreams[p.ID]})
  5835  	}
  5836  
  5837  	// If we could not select enough peers, fail.
  5838  	if len(nodes) < (r - len(existing)) {
  5839  		s.Debugf("Peer selection: required %d nodes but found %d (cluster: %s replica: %d existing: %v/%d peers: %d result-peers: %d err: %+v)",
  5840  			(r - len(existing)), len(nodes), cluster, r, existing, replaceFirstExisting, len(peers), len(nodes), err)
  5841  		if len(peers) == 0 {
  5842  			err.noJsClust = true
  5843  		}
  5844  		return nil, &err
  5845  	}
  5846  	// Sort based on available from most to least, breaking ties by number of total streams assigned to the peer.
  5847  	sort.Slice(nodes, func(i, j int) bool {
  5848  		if nodes[i].avail == nodes[j].avail {
  5849  			return nodes[i].ns < nodes[j].ns
  5850  		}
  5851  		return nodes[i].avail > nodes[j].avail
  5852  	})
  5853  	// If we are placing a replicated stream, let's sort based on HAAssets, as that is more important to balance.
  5854  	if cfg.Replicas > 1 {
  5855  		sort.SliceStable(nodes, func(i, j int) bool { return nodes[i].ha < nodes[j].ha })
  5856  	}
  5857  
  5858  	var results []string
  5859  	if len(existing) > 0 {
  5860  		results = append(results, existing...)
  5861  		r -= len(existing)
  5862  	}
  5863  	for _, r := range nodes[:r] {
  5864  		results = append(results, r.id)
  5865  	}
  5866  	return results, nil
  5867  }
  5868  
  5869  func groupNameForStream(peers []string, storage StorageType) string {
  5870  	return groupName("S", peers, storage)
  5871  }
  5872  
  5873  func groupNameForConsumer(peers []string, storage StorageType) string {
  5874  	return groupName("C", peers, storage)
  5875  }
  5876  
  5877  func groupName(prefix string, peers []string, storage StorageType) string {
  5878  	gns := getHash(nuid.Next())
  5879  	return fmt.Sprintf("%s-R%d%s-%s", prefix, len(peers), storage.String()[:1], gns)
  5880  }
  5881  
  5882  // returns stream count for this tier as well as applicable reservation size (not including reservations for cfg)
  5883  // jetStream read lock should be held
  5884  func tieredStreamAndReservationCount(asa map[string]*streamAssignment, tier string, cfg *StreamConfig) (int, int64) {
  5885  	var numStreams int
  5886  	var reservation int64
  5887  	for _, sa := range asa {
  5888  		if tier == _EMPTY_ || isSameTier(sa.Config, cfg) {
  5889  			numStreams++
  5890  			if sa.Config.MaxBytes > 0 && sa.Config.Storage == cfg.Storage && sa.Config.Name != cfg.Name {
  5891  				// If tier is empty, all storage is flat and we should adjust for replicas.
  5892  				// Otherwise if tiered, storage replication already taken into consideration.
  5893  				if tier == _EMPTY_ && cfg.Replicas > 1 {
  5894  					reservation += sa.Config.MaxBytes * int64(cfg.Replicas)
  5895  				} else {
  5896  					reservation += sa.Config.MaxBytes
  5897  				}
  5898  			}
  5899  		}
  5900  	}
  5901  	return numStreams, reservation
  5902  }
  5903  
  5904  // createGroupForStream will create a group for assignment for the stream.
  5905  // Lock should be held.
  5906  func (js *jetStream) createGroupForStream(ci *ClientInfo, cfg *StreamConfig) (*raftGroup, *selectPeerError) {
  5907  	replicas := cfg.Replicas
  5908  	if replicas == 0 {
  5909  		replicas = 1
  5910  	}
  5911  
  5912  	// Default connected cluster from the request origin.
  5913  	cc, cluster := js.cluster, ci.Cluster
  5914  	// If specified, override the default.
  5915  	clusterDefined := cfg.Placement != nil && cfg.Placement.Cluster != _EMPTY_
  5916  	if clusterDefined {
  5917  		cluster = cfg.Placement.Cluster
  5918  	}
  5919  	clusters := []string{cluster}
  5920  	if !clusterDefined {
  5921  		clusters = append(clusters, ci.Alternates...)
  5922  	}
  5923  
  5924  	// Need to create a group here.
  5925  	errs := &selectPeerError{}
  5926  	for _, cn := range clusters {
  5927  		peers, err := cc.selectPeerGroup(replicas, cn, cfg, nil, 0, nil)
  5928  		if len(peers) < replicas {
  5929  			errs.accumulate(err)
  5930  			continue
  5931  		}
  5932  		return &raftGroup{Name: groupNameForStream(peers, cfg.Storage), Storage: cfg.Storage, Peers: peers, Cluster: cn}, nil
  5933  	}
  5934  	return nil, errs
  5935  }
  5936  
  5937  func (acc *Account) selectLimits(cfg *StreamConfig) (*JetStreamAccountLimits, string, *jsAccount, *ApiError) {
  5938  	// Grab our jetstream account info.
  5939  	acc.mu.RLock()
  5940  	jsa := acc.js
  5941  	acc.mu.RUnlock()
  5942  
  5943  	if jsa == nil {
  5944  		return nil, _EMPTY_, nil, NewJSNotEnabledForAccountError()
  5945  	}
  5946  
  5947  	jsa.usageMu.RLock()
  5948  	selectedLimits, tierName, ok := jsa.selectLimits(cfg)
  5949  	jsa.usageMu.RUnlock()
  5950  
  5951  	if !ok {
  5952  		return nil, _EMPTY_, nil, NewJSNoLimitsError()
  5953  	}
  5954  	return &selectedLimits, tierName, jsa, nil
  5955  }
  5956  
  5957  // Read lock needs to be held
  5958  func (js *jetStream) jsClusteredStreamLimitsCheck(acc *Account, cfg *StreamConfig) *ApiError {
  5959  	selectedLimits, tier, _, apiErr := acc.selectLimits(cfg)
  5960  	if apiErr != nil {
  5961  		return apiErr
  5962  	}
  5963  
  5964  	asa := js.cluster.streams[acc.Name]
  5965  	numStreams, reservations := tieredStreamAndReservationCount(asa, tier, cfg)
  5966  	// Check for inflight proposals...
  5967  	if cc := js.cluster; cc != nil && cc.inflight != nil {
  5968  		numStreams += len(cc.inflight[acc.Name])
  5969  	}
  5970  	if selectedLimits.MaxStreams > 0 && numStreams >= selectedLimits.MaxStreams {
  5971  		return NewJSMaximumStreamsLimitError()
  5972  	}
  5973  	// Check for account limits here before proposing.
  5974  	if err := js.checkAccountLimits(selectedLimits, cfg, reservations); err != nil {
  5975  		return NewJSStreamLimitsError(err, Unless(err))
  5976  	}
  5977  	return nil
  5978  }
  5979  
  5980  func (s *Server) jsClusteredStreamRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, config *StreamConfig) {
  5981  	js, cc := s.getJetStreamCluster()
  5982  	if js == nil || cc == nil {
  5983  		return
  5984  	}
  5985  
  5986  	var resp = JSApiStreamCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamCreateResponseType}}
  5987  
  5988  	ccfg, apiErr := s.checkStreamCfg(config, acc)
  5989  	if apiErr != nil {
  5990  		resp.Error = apiErr
  5991  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  5992  		return
  5993  	}
  5994  	cfg := &ccfg
  5995  
  5996  	// Now process the request and proposal.
  5997  	js.mu.Lock()
  5998  	defer js.mu.Unlock()
  5999  
  6000  	var self *streamAssignment
  6001  	var rg *raftGroup
  6002  
  6003  	// Capture if we have existing assignment first.
  6004  	if osa := js.streamAssignment(acc.Name, cfg.Name); osa != nil {
  6005  		if !reflect.DeepEqual(osa.Config, cfg) {
  6006  			resp.Error = NewJSStreamNameExistError()
  6007  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6008  			return
  6009  		}
  6010  		// This is an equal assignment.
  6011  		self, rg = osa, osa.Group
  6012  	}
  6013  
  6014  	if cfg.Sealed {
  6015  		resp.Error = NewJSStreamInvalidConfigError(fmt.Errorf("stream configuration for create can not be sealed"))
  6016  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6017  		return
  6018  	}
  6019  
  6020  	// Check for subject collisions here.
  6021  	if cc.subjectsOverlap(acc.Name, cfg.Subjects, self) {
  6022  		resp.Error = NewJSStreamSubjectOverlapError()
  6023  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6024  		return
  6025  	}
  6026  
  6027  	apiErr = js.jsClusteredStreamLimitsCheck(acc, cfg)
  6028  	// Check for stream limits here before proposing. These need to be tracked from meta layer, not jsa.
  6029  	if apiErr != nil {
  6030  		resp.Error = apiErr
  6031  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6032  		return
  6033  	}
  6034  
  6035  	// Raft group selection and placement.
  6036  	if rg == nil {
  6037  		// Check inflight before proposing in case we have an existing inflight proposal.
  6038  		if cc.inflight == nil {
  6039  			cc.inflight = make(map[string]map[string]*raftGroup)
  6040  		}
  6041  		streams, ok := cc.inflight[acc.Name]
  6042  		if !ok {
  6043  			streams = make(map[string]*raftGroup)
  6044  			cc.inflight[acc.Name] = streams
  6045  		} else if existing, ok := streams[cfg.Name]; ok {
  6046  			// We have existing for same stream. Re-use same group.
  6047  			rg = existing
  6048  		}
  6049  	}
  6050  	// Create a new one here if needed.
  6051  	if rg == nil {
  6052  		nrg, err := js.createGroupForStream(ci, cfg)
  6053  		if err != nil {
  6054  			resp.Error = NewJSClusterNoPeersError(err)
  6055  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6056  			return
  6057  		}
  6058  		rg = nrg
  6059  		// Pick a preferred leader.
  6060  		rg.setPreferred()
  6061  	}
  6062  
  6063  	// Sync subject for post snapshot sync.
  6064  	sa := &streamAssignment{Group: rg, Sync: syncSubjForStream(), Config: cfg, Subject: subject, Reply: reply, Client: ci, Created: time.Now().UTC()}
  6065  	if err := cc.meta.Propose(encodeAddStreamAssignment(sa)); err == nil {
  6066  		// On success, add this as an inflight proposal so we can apply limits
  6067  		// on concurrent create requests while this stream assignment has
  6068  		// possibly not been processed yet.
  6069  		if streams, ok := cc.inflight[acc.Name]; ok {
  6070  			streams[cfg.Name] = rg
  6071  		}
  6072  	}
  6073  }
  6074  
  6075  var (
  6076  	errReqTimeout = errors.New("timeout while waiting for response")
  6077  	errReqSrvExit = errors.New("server shutdown while waiting for response")
  6078  )
  6079  
  6080  // blocking utility call to perform requests on the system account
  6081  // returns (synchronized) v or error
  6082  func sysRequest[T any](s *Server, subjFormat string, args ...any) (*T, error) {
  6083  	isubj := fmt.Sprintf(subjFormat, args...)
  6084  
  6085  	s.mu.Lock()
  6086  	inbox := s.newRespInbox()
  6087  	results := make(chan *T, 1)
  6088  	s.sys.replies[inbox] = func(_ *subscription, _ *client, _ *Account, _, _ string, msg []byte) {
  6089  		var v T
  6090  		if err := json.Unmarshal(msg, &v); err != nil {
  6091  			s.Warnf("Error unmarshalling response for request '%s':%v", isubj, err)
  6092  			return
  6093  		}
  6094  		select {
  6095  		case results <- &v:
  6096  		default:
  6097  			s.Warnf("Failed placing request response on internal channel")
  6098  		}
  6099  	}
  6100  	s.mu.Unlock()
  6101  
  6102  	s.sendInternalMsgLocked(isubj, inbox, nil, nil)
  6103  
  6104  	defer func() {
  6105  		s.mu.Lock()
  6106  		defer s.mu.Unlock()
  6107  		if s.sys != nil && s.sys.replies != nil {
  6108  			delete(s.sys.replies, inbox)
  6109  		}
  6110  	}()
  6111  
  6112  	ttl := time.NewTimer(2 * time.Second)
  6113  	defer ttl.Stop()
  6114  
  6115  	select {
  6116  	case <-s.quitCh:
  6117  		return nil, errReqSrvExit
  6118  	case <-ttl.C:
  6119  		return nil, errReqTimeout
  6120  	case data := <-results:
  6121  		return data, nil
  6122  	}
  6123  }
  6124  
  6125  func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, cfg *StreamConfig, peerSet []string) {
  6126  	js, cc := s.getJetStreamCluster()
  6127  	if js == nil || cc == nil {
  6128  		return
  6129  	}
  6130  
  6131  	// Now process the request and proposal.
  6132  	js.mu.Lock()
  6133  	defer js.mu.Unlock()
  6134  	meta := cc.meta
  6135  	if meta == nil {
  6136  		return
  6137  	}
  6138  
  6139  	var resp = JSApiStreamUpdateResponse{ApiResponse: ApiResponse{Type: JSApiStreamUpdateResponseType}}
  6140  
  6141  	osa := js.streamAssignment(acc.Name, cfg.Name)
  6142  
  6143  	if osa == nil {
  6144  		resp.Error = NewJSStreamNotFoundError()
  6145  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6146  		return
  6147  	}
  6148  	var newCfg *StreamConfig
  6149  	if jsa := js.accounts[acc.Name]; jsa != nil {
  6150  		js.mu.Unlock()
  6151  		ncfg, err := jsa.configUpdateCheck(osa.Config, cfg, s)
  6152  		js.mu.Lock()
  6153  		if err != nil {
  6154  			resp.Error = NewJSStreamUpdateError(err, Unless(err))
  6155  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6156  			return
  6157  		} else {
  6158  			newCfg = ncfg
  6159  		}
  6160  	} else {
  6161  		resp.Error = NewJSNotEnabledForAccountError()
  6162  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6163  		return
  6164  	}
  6165  	// Check for mirror changes which are not allowed.
  6166  	if !reflect.DeepEqual(newCfg.Mirror, osa.Config.Mirror) {
  6167  		resp.Error = NewJSStreamMirrorNotUpdatableError()
  6168  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6169  		return
  6170  	}
  6171  
  6172  	// Check for subject collisions here.
  6173  	if cc.subjectsOverlap(acc.Name, cfg.Subjects, osa) {
  6174  		resp.Error = NewJSStreamSubjectOverlapError()
  6175  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6176  		return
  6177  	}
  6178  
  6179  	// Make copy so to not change original.
  6180  	rg := osa.copyGroup().Group
  6181  
  6182  	// Check for a move request.
  6183  	var isMoveRequest, isMoveCancel bool
  6184  	if lPeerSet := len(peerSet); lPeerSet > 0 {
  6185  		isMoveRequest = true
  6186  		// check if this is a cancellation
  6187  		if lPeerSet == osa.Config.Replicas && lPeerSet <= len(rg.Peers) {
  6188  			isMoveCancel = true
  6189  			// can only be a cancellation if the peer sets overlap as expected
  6190  			for i := 0; i < lPeerSet; i++ {
  6191  				if peerSet[i] != rg.Peers[i] {
  6192  					isMoveCancel = false
  6193  					break
  6194  				}
  6195  			}
  6196  		}
  6197  	} else {
  6198  		isMoveRequest = newCfg.Placement != nil && !reflect.DeepEqual(osa.Config.Placement, newCfg.Placement)
  6199  	}
  6200  
  6201  	// Check for replica changes.
  6202  	isReplicaChange := newCfg.Replicas != osa.Config.Replicas
  6203  
  6204  	// We stage consumer updates and do them after the stream update.
  6205  	var consumers []*consumerAssignment
  6206  
  6207  	// Check if this is a move request, but no cancellation, and we are already moving this stream.
  6208  	if isMoveRequest && !isMoveCancel && osa.Config.Replicas != len(rg.Peers) {
  6209  		// obtain stats to include in error message
  6210  		msg := _EMPTY_
  6211  		if s.allPeersOffline(rg) {
  6212  			msg = fmt.Sprintf("all %d peers offline", len(rg.Peers))
  6213  		} else {
  6214  			// Need to release js lock.
  6215  			js.mu.Unlock()
  6216  			if si, err := sysRequest[StreamInfo](s, clusterStreamInfoT, ci.serviceAccount(), cfg.Name); err != nil {
  6217  				msg = fmt.Sprintf("error retrieving info: %s", err.Error())
  6218  			} else if si != nil {
  6219  				currentCount := 0
  6220  				if si.Cluster.Leader != _EMPTY_ {
  6221  					currentCount++
  6222  				}
  6223  				combinedLag := uint64(0)
  6224  				for _, r := range si.Cluster.Replicas {
  6225  					if r.Current {
  6226  						currentCount++
  6227  					}
  6228  					combinedLag += r.Lag
  6229  				}
  6230  				msg = fmt.Sprintf("total peers: %d, current peers: %d, combined lag: %d",
  6231  					len(rg.Peers), currentCount, combinedLag)
  6232  			}
  6233  			// Re-acquire here.
  6234  			js.mu.Lock()
  6235  		}
  6236  		resp.Error = NewJSStreamMoveInProgressError(msg)
  6237  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6238  		return
  6239  	}
  6240  
  6241  	// Can not move and scale at same time.
  6242  	if isMoveRequest && isReplicaChange {
  6243  		resp.Error = NewJSStreamMoveAndScaleError()
  6244  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6245  		return
  6246  	}
  6247  
  6248  	if isReplicaChange {
  6249  		// We are adding new peers here.
  6250  		if newCfg.Replicas > len(rg.Peers) {
  6251  			// Check that we have the allocation available.
  6252  			if err := js.jsClusteredStreamLimitsCheck(acc, newCfg); err != nil {
  6253  				resp.Error = err
  6254  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6255  				return
  6256  			}
  6257  			// Check if we do not have a cluster assigned, and if we do not make sure we
  6258  			// try to pick one. This could happen with older streams that were assigned by
  6259  			// previous servers.
  6260  			if rg.Cluster == _EMPTY_ {
  6261  				// Prefer placement directrives if we have them.
  6262  				if newCfg.Placement != nil && newCfg.Placement.Cluster != _EMPTY_ {
  6263  					rg.Cluster = newCfg.Placement.Cluster
  6264  				} else {
  6265  					// Fall back to the cluster assignment from the client.
  6266  					rg.Cluster = ci.Cluster
  6267  				}
  6268  			}
  6269  			peers, err := cc.selectPeerGroup(newCfg.Replicas, rg.Cluster, newCfg, rg.Peers, 0, nil)
  6270  			if err != nil {
  6271  				resp.Error = NewJSClusterNoPeersError(err)
  6272  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6273  				return
  6274  			}
  6275  			// Single nodes are not recorded by the NRG layer so we can rename.
  6276  			if len(peers) == 1 {
  6277  				rg.Name = groupNameForStream(peers, rg.Storage)
  6278  			} else if len(rg.Peers) == 1 {
  6279  				// This is scale up from being a singelton, set preferred to that singelton.
  6280  				rg.Preferred = rg.Peers[0]
  6281  			}
  6282  			rg.Peers = peers
  6283  		} else {
  6284  			// We are deleting nodes here. We want to do our best to preserve the current leader.
  6285  			// We have support now from above that guarantees we are in our own Go routine, so can
  6286  			// ask for stream info from the stream leader to make sure we keep the leader in the new list.
  6287  			var curLeader string
  6288  			if !s.allPeersOffline(rg) {
  6289  				// Need to release js lock.
  6290  				js.mu.Unlock()
  6291  				if si, err := sysRequest[StreamInfo](s, clusterStreamInfoT, ci.serviceAccount(), cfg.Name); err != nil {
  6292  					s.Warnf("Did not receive stream info results for '%s > %s' due to: %s", acc, cfg.Name, err)
  6293  				} else if si != nil {
  6294  					if cl := si.Cluster; cl != nil && cl.Leader != _EMPTY_ {
  6295  						curLeader = getHash(cl.Leader)
  6296  					}
  6297  				}
  6298  				// Re-acquire here.
  6299  				js.mu.Lock()
  6300  			}
  6301  			// If we identified a leader make sure its part of the new group.
  6302  			selected := make([]string, 0, newCfg.Replicas)
  6303  
  6304  			if curLeader != _EMPTY_ {
  6305  				selected = append(selected, curLeader)
  6306  			}
  6307  			for _, peer := range rg.Peers {
  6308  				if len(selected) == newCfg.Replicas {
  6309  					break
  6310  				}
  6311  				if peer == curLeader {
  6312  					continue
  6313  				}
  6314  				if si, ok := s.nodeToInfo.Load(peer); ok && si != nil {
  6315  					if si.(nodeInfo).offline {
  6316  						continue
  6317  					}
  6318  					selected = append(selected, peer)
  6319  				}
  6320  			}
  6321  			rg.Peers = selected
  6322  		}
  6323  
  6324  		// Need to remap any consumers.
  6325  		for _, ca := range osa.consumers {
  6326  			// Ephemerals are R=1, so only auto-remap durables, or R>1, unless stream is interest or workqueue policy.
  6327  			numPeers := len(ca.Group.Peers)
  6328  			if ca.Config.Durable != _EMPTY_ || numPeers > 1 || cfg.Retention != LimitsPolicy {
  6329  				cca := ca.copyGroup()
  6330  				// Adjust preferred as needed.
  6331  				if numPeers == 1 && len(rg.Peers) > 1 {
  6332  					cca.Group.Preferred = ca.Group.Peers[0]
  6333  				} else {
  6334  					cca.Group.Preferred = _EMPTY_
  6335  				}
  6336  				// Assign new peers.
  6337  				cca.Group.Peers = rg.Peers
  6338  				// We can not propose here before the stream itself so we collect them.
  6339  				consumers = append(consumers, cca)
  6340  			}
  6341  		}
  6342  	} else if isMoveRequest {
  6343  		if len(peerSet) == 0 {
  6344  			nrg, err := js.createGroupForStream(ci, newCfg)
  6345  			if err != nil {
  6346  				resp.Error = NewJSClusterNoPeersError(err)
  6347  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6348  				return
  6349  			}
  6350  			// filter peers present in both sets
  6351  			for _, peer := range rg.Peers {
  6352  				found := false
  6353  				for _, newPeer := range nrg.Peers {
  6354  					if peer == newPeer {
  6355  						found = true
  6356  						break
  6357  					}
  6358  				}
  6359  				if !found {
  6360  					peerSet = append(peerSet, peer)
  6361  				}
  6362  			}
  6363  			peerSet = append(peerSet, nrg.Peers...)
  6364  		}
  6365  		if len(rg.Peers) == 1 {
  6366  			rg.Preferred = peerSet[0]
  6367  		}
  6368  		rg.Peers = peerSet
  6369  
  6370  		for _, ca := range osa.consumers {
  6371  			cca := ca.copyGroup()
  6372  			r := cca.Config.replicas(osa.Config)
  6373  			// shuffle part of cluster peer set we will be keeping
  6374  			randPeerSet := copyStrings(peerSet[len(peerSet)-newCfg.Replicas:])
  6375  			rand.Shuffle(newCfg.Replicas, func(i, j int) { randPeerSet[i], randPeerSet[j] = randPeerSet[j], randPeerSet[i] })
  6376  			// move overlapping peers at the end of randPeerSet and keep a tally of non overlapping peers
  6377  			dropPeerSet := make([]string, 0, len(cca.Group.Peers))
  6378  			for _, p := range cca.Group.Peers {
  6379  				found := false
  6380  				for i, rp := range randPeerSet {
  6381  					if p == rp {
  6382  						randPeerSet[i] = randPeerSet[newCfg.Replicas-1]
  6383  						randPeerSet[newCfg.Replicas-1] = p
  6384  						found = true
  6385  						break
  6386  					}
  6387  				}
  6388  				if !found {
  6389  					dropPeerSet = append(dropPeerSet, p)
  6390  				}
  6391  			}
  6392  			cPeerSet := randPeerSet[newCfg.Replicas-r:]
  6393  			// In case of a set or cancel simply assign
  6394  			if len(peerSet) == newCfg.Replicas {
  6395  				cca.Group.Peers = cPeerSet
  6396  			} else {
  6397  				cca.Group.Peers = append(dropPeerSet, cPeerSet...)
  6398  			}
  6399  			// make sure it overlaps with peers and remove if not
  6400  			if cca.Group.Preferred != _EMPTY_ {
  6401  				found := false
  6402  				for _, p := range cca.Group.Peers {
  6403  					if p == cca.Group.Preferred {
  6404  						found = true
  6405  						break
  6406  					}
  6407  				}
  6408  				if !found {
  6409  					cca.Group.Preferred = _EMPTY_
  6410  				}
  6411  			}
  6412  			// We can not propose here before the stream itself so we collect them.
  6413  			consumers = append(consumers, cca)
  6414  		}
  6415  	} else {
  6416  		// All other updates make sure no preferred is set.
  6417  		rg.Preferred = _EMPTY_
  6418  	}
  6419  
  6420  	sa := &streamAssignment{Group: rg, Sync: osa.Sync, Created: osa.Created, Config: newCfg, Subject: subject, Reply: reply, Client: ci}
  6421  	meta.Propose(encodeUpdateStreamAssignment(sa))
  6422  
  6423  	// Process any staged consumers.
  6424  	for _, ca := range consumers {
  6425  		meta.Propose(encodeAddConsumerAssignment(ca))
  6426  	}
  6427  }
  6428  
  6429  func (s *Server) jsClusteredStreamDeleteRequest(ci *ClientInfo, acc *Account, stream, subject, reply string, rmsg []byte) {
  6430  	js, cc := s.getJetStreamCluster()
  6431  	if js == nil || cc == nil {
  6432  		return
  6433  	}
  6434  
  6435  	js.mu.Lock()
  6436  	defer js.mu.Unlock()
  6437  
  6438  	if cc.meta == nil {
  6439  		return
  6440  	}
  6441  
  6442  	osa := js.streamAssignment(acc.Name, stream)
  6443  	if osa == nil {
  6444  		var resp = JSApiStreamDeleteResponse{ApiResponse: ApiResponse{Type: JSApiStreamDeleteResponseType}}
  6445  		resp.Error = NewJSStreamNotFoundError()
  6446  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6447  		return
  6448  	}
  6449  
  6450  	sa := &streamAssignment{Group: osa.Group, Config: osa.Config, Subject: subject, Reply: reply, Client: ci}
  6451  	cc.meta.Propose(encodeDeleteStreamAssignment(sa))
  6452  }
  6453  
  6454  // Process a clustered purge request.
  6455  func (s *Server) jsClusteredStreamPurgeRequest(
  6456  	ci *ClientInfo,
  6457  	acc *Account,
  6458  	mset *stream,
  6459  	stream, subject, reply string,
  6460  	rmsg []byte,
  6461  	preq *JSApiStreamPurgeRequest,
  6462  ) {
  6463  	js, cc := s.getJetStreamCluster()
  6464  	if js == nil || cc == nil {
  6465  		return
  6466  	}
  6467  
  6468  	js.mu.Lock()
  6469  	sa := js.streamAssignment(acc.Name, stream)
  6470  	if sa == nil {
  6471  		resp := JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}}
  6472  		resp.Error = NewJSStreamNotFoundError()
  6473  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6474  		js.mu.Unlock()
  6475  		return
  6476  	}
  6477  
  6478  	if n := sa.Group.node; n != nil {
  6479  		sp := &streamPurge{Stream: stream, LastSeq: mset.state().LastSeq, Subject: subject, Reply: reply, Client: ci, Request: preq}
  6480  		n.Propose(encodeStreamPurge(sp))
  6481  		js.mu.Unlock()
  6482  		return
  6483  	}
  6484  	js.mu.Unlock()
  6485  
  6486  	if mset == nil {
  6487  		return
  6488  	}
  6489  
  6490  	var resp = JSApiStreamPurgeResponse{ApiResponse: ApiResponse{Type: JSApiStreamPurgeResponseType}}
  6491  	purged, err := mset.purge(preq)
  6492  	if err != nil {
  6493  		resp.Error = NewJSStreamGeneralError(err, Unless(err))
  6494  	} else {
  6495  		resp.Purged = purged
  6496  		resp.Success = true
  6497  	}
  6498  	s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6499  }
  6500  
  6501  func (s *Server) jsClusteredStreamRestoreRequest(
  6502  	ci *ClientInfo,
  6503  	acc *Account,
  6504  	req *JSApiStreamRestoreRequest,
  6505  	subject, reply string, rmsg []byte) {
  6506  
  6507  	js, cc := s.getJetStreamCluster()
  6508  	if js == nil || cc == nil {
  6509  		return
  6510  	}
  6511  
  6512  	js.mu.Lock()
  6513  	defer js.mu.Unlock()
  6514  
  6515  	if cc.meta == nil {
  6516  		return
  6517  	}
  6518  
  6519  	cfg := &req.Config
  6520  	resp := JSApiStreamRestoreResponse{ApiResponse: ApiResponse{Type: JSApiStreamRestoreResponseType}}
  6521  
  6522  	if err := js.jsClusteredStreamLimitsCheck(acc, cfg); err != nil {
  6523  		resp.Error = err
  6524  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6525  		return
  6526  	}
  6527  
  6528  	if sa := js.streamAssignment(ci.serviceAccount(), cfg.Name); sa != nil {
  6529  		resp.Error = NewJSStreamNameExistRestoreFailedError()
  6530  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6531  		return
  6532  	}
  6533  
  6534  	// Raft group selection and placement.
  6535  	rg, err := js.createGroupForStream(ci, cfg)
  6536  	if err != nil {
  6537  		resp.Error = NewJSClusterNoPeersError(err)
  6538  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6539  		return
  6540  	}
  6541  	// Pick a preferred leader.
  6542  	rg.setPreferred()
  6543  	sa := &streamAssignment{Group: rg, Sync: syncSubjForStream(), Config: cfg, Subject: subject, Reply: reply, Client: ci, Created: time.Now().UTC()}
  6544  	// Now add in our restore state and pre-select a peer to handle the actual receipt of the snapshot.
  6545  	sa.Restore = &req.State
  6546  	cc.meta.Propose(encodeAddStreamAssignment(sa))
  6547  }
  6548  
  6549  // Determine if all peers for this group are offline.
  6550  func (s *Server) allPeersOffline(rg *raftGroup) bool {
  6551  	if rg == nil {
  6552  		return false
  6553  	}
  6554  	// Check to see if this stream has any servers online to respond.
  6555  	for _, peer := range rg.Peers {
  6556  		if si, ok := s.nodeToInfo.Load(peer); ok && si != nil {
  6557  			if !si.(nodeInfo).offline {
  6558  				return false
  6559  			}
  6560  		}
  6561  	}
  6562  	return true
  6563  }
  6564  
  6565  // This will do a scatter and gather operation for all streams for this account. This is only called from metadata leader.
  6566  // This will be running in a separate Go routine.
  6567  func (s *Server) jsClusteredStreamListRequest(acc *Account, ci *ClientInfo, filter string, offset int, subject, reply string, rmsg []byte) {
  6568  	defer s.grWG.Done()
  6569  
  6570  	js, cc := s.getJetStreamCluster()
  6571  	if js == nil || cc == nil {
  6572  		return
  6573  	}
  6574  
  6575  	js.mu.RLock()
  6576  
  6577  	var streams []*streamAssignment
  6578  	for _, sa := range cc.streams[acc.Name] {
  6579  		if IsNatsErr(sa.err, JSClusterNotAssignedErr) {
  6580  			continue
  6581  		}
  6582  
  6583  		if filter != _EMPTY_ {
  6584  			// These could not have subjects auto-filled in since they are raw and unprocessed.
  6585  			if len(sa.Config.Subjects) == 0 {
  6586  				if SubjectsCollide(filter, sa.Config.Name) {
  6587  					streams = append(streams, sa)
  6588  				}
  6589  			} else {
  6590  				for _, subj := range sa.Config.Subjects {
  6591  					if SubjectsCollide(filter, subj) {
  6592  						streams = append(streams, sa)
  6593  						break
  6594  					}
  6595  				}
  6596  			}
  6597  		} else {
  6598  			streams = append(streams, sa)
  6599  		}
  6600  	}
  6601  
  6602  	// Needs to be sorted for offsets etc.
  6603  	if len(streams) > 1 {
  6604  		sort.Slice(streams, func(i, j int) bool {
  6605  			return strings.Compare(streams[i].Config.Name, streams[j].Config.Name) < 0
  6606  		})
  6607  	}
  6608  
  6609  	scnt := len(streams)
  6610  	if offset > scnt {
  6611  		offset = scnt
  6612  	}
  6613  	if offset > 0 {
  6614  		streams = streams[offset:]
  6615  	}
  6616  	if len(streams) > JSApiListLimit {
  6617  		streams = streams[:JSApiListLimit]
  6618  	}
  6619  
  6620  	var resp = JSApiStreamListResponse{
  6621  		ApiResponse: ApiResponse{Type: JSApiStreamListResponseType},
  6622  		Streams:     make([]*StreamInfo, 0, len(streams)),
  6623  	}
  6624  
  6625  	js.mu.RUnlock()
  6626  
  6627  	if len(streams) == 0 {
  6628  		resp.Limit = JSApiListLimit
  6629  		resp.Offset = offset
  6630  		s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6631  		return
  6632  	}
  6633  
  6634  	// Create an inbox for our responses and send out our requests.
  6635  	s.mu.Lock()
  6636  	inbox := s.newRespInbox()
  6637  	rc := make(chan *StreamInfo, len(streams))
  6638  
  6639  	// Store our handler.
  6640  	s.sys.replies[inbox] = func(sub *subscription, _ *client, _ *Account, subject, _ string, msg []byte) {
  6641  		var si StreamInfo
  6642  		if err := json.Unmarshal(msg, &si); err != nil {
  6643  			s.Warnf("Error unmarshalling clustered stream info response:%v", err)
  6644  			return
  6645  		}
  6646  		select {
  6647  		case rc <- &si:
  6648  		default:
  6649  			s.Warnf("Failed placing remote stream info result on internal channel")
  6650  		}
  6651  	}
  6652  	s.mu.Unlock()
  6653  
  6654  	// Cleanup after.
  6655  	defer func() {
  6656  		s.mu.Lock()
  6657  		if s.sys != nil && s.sys.replies != nil {
  6658  			delete(s.sys.replies, inbox)
  6659  		}
  6660  		s.mu.Unlock()
  6661  	}()
  6662  
  6663  	var missingNames []string
  6664  	sent := map[string]int{}
  6665  
  6666  	// Send out our requests here.
  6667  	js.mu.RLock()
  6668  	for _, sa := range streams {
  6669  		if s.allPeersOffline(sa.Group) {
  6670  			// Place offline onto our results by hand here.
  6671  			si := &StreamInfo{
  6672  				Config:    *sa.Config,
  6673  				Created:   sa.Created,
  6674  				Cluster:   js.offlineClusterInfo(sa.Group),
  6675  				TimeStamp: time.Now().UTC(),
  6676  			}
  6677  			resp.Streams = append(resp.Streams, si)
  6678  			missingNames = append(missingNames, sa.Config.Name)
  6679  		} else {
  6680  			isubj := fmt.Sprintf(clusterStreamInfoT, sa.Client.serviceAccount(), sa.Config.Name)
  6681  			s.sendInternalMsgLocked(isubj, inbox, nil, nil)
  6682  			sent[sa.Config.Name] = len(sa.consumers)
  6683  		}
  6684  	}
  6685  	// Don't hold lock.
  6686  	js.mu.RUnlock()
  6687  
  6688  	const timeout = 4 * time.Second
  6689  	notActive := time.NewTimer(timeout)
  6690  	defer notActive.Stop()
  6691  
  6692  LOOP:
  6693  	for len(sent) > 0 {
  6694  		select {
  6695  		case <-s.quitCh:
  6696  			return
  6697  		case <-notActive.C:
  6698  			s.Warnf("Did not receive all stream info results for %q", acc)
  6699  			for sName := range sent {
  6700  				missingNames = append(missingNames, sName)
  6701  			}
  6702  			break LOOP
  6703  		case si := <-rc:
  6704  			consCount := sent[si.Config.Name]
  6705  			if consCount > 0 {
  6706  				si.State.Consumers = consCount
  6707  			}
  6708  			delete(sent, si.Config.Name)
  6709  			resp.Streams = append(resp.Streams, si)
  6710  			// Check to see if we are done.
  6711  			if len(resp.Streams) == len(streams) {
  6712  				break LOOP
  6713  			}
  6714  		}
  6715  	}
  6716  
  6717  	// Needs to be sorted as well.
  6718  	if len(resp.Streams) > 1 {
  6719  		sort.Slice(resp.Streams, func(i, j int) bool {
  6720  			return strings.Compare(resp.Streams[i].Config.Name, resp.Streams[j].Config.Name) < 0
  6721  		})
  6722  	}
  6723  
  6724  	resp.Total = scnt
  6725  	resp.Limit = JSApiListLimit
  6726  	resp.Offset = offset
  6727  	resp.Missing = missingNames
  6728  	s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6729  }
  6730  
  6731  // This will do a scatter and gather operation for all consumers for this stream and account.
  6732  // This will be running in a separate Go routine.
  6733  func (s *Server) jsClusteredConsumerListRequest(acc *Account, ci *ClientInfo, offset int, stream, subject, reply string, rmsg []byte) {
  6734  	defer s.grWG.Done()
  6735  
  6736  	js, cc := s.getJetStreamCluster()
  6737  	if js == nil || cc == nil {
  6738  		return
  6739  	}
  6740  
  6741  	js.mu.RLock()
  6742  
  6743  	var consumers []*consumerAssignment
  6744  	if sas := cc.streams[acc.Name]; sas != nil {
  6745  		if sa := sas[stream]; sa != nil {
  6746  			// Copy over since we need to sort etc.
  6747  			for _, ca := range sa.consumers {
  6748  				consumers = append(consumers, ca)
  6749  			}
  6750  		}
  6751  	}
  6752  	// Needs to be sorted.
  6753  	if len(consumers) > 1 {
  6754  		sort.Slice(consumers, func(i, j int) bool {
  6755  			return strings.Compare(consumers[i].Name, consumers[j].Name) < 0
  6756  		})
  6757  	}
  6758  
  6759  	ocnt := len(consumers)
  6760  	if offset > ocnt {
  6761  		offset = ocnt
  6762  	}
  6763  	if offset > 0 {
  6764  		consumers = consumers[offset:]
  6765  	}
  6766  	if len(consumers) > JSApiListLimit {
  6767  		consumers = consumers[:JSApiListLimit]
  6768  	}
  6769  
  6770  	// Send out our requests here.
  6771  	var resp = JSApiConsumerListResponse{
  6772  		ApiResponse: ApiResponse{Type: JSApiConsumerListResponseType},
  6773  		Consumers:   []*ConsumerInfo{},
  6774  	}
  6775  
  6776  	js.mu.RUnlock()
  6777  
  6778  	if len(consumers) == 0 {
  6779  		resp.Limit = JSApiListLimit
  6780  		resp.Offset = offset
  6781  		s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6782  		return
  6783  	}
  6784  
  6785  	// Create an inbox for our responses and send out requests.
  6786  	s.mu.Lock()
  6787  	inbox := s.newRespInbox()
  6788  	rc := make(chan *ConsumerInfo, len(consumers))
  6789  
  6790  	// Store our handler.
  6791  	s.sys.replies[inbox] = func(sub *subscription, _ *client, _ *Account, subject, _ string, msg []byte) {
  6792  		var ci ConsumerInfo
  6793  		if err := json.Unmarshal(msg, &ci); err != nil {
  6794  			s.Warnf("Error unmarshaling clustered consumer info response:%v", err)
  6795  			return
  6796  		}
  6797  		select {
  6798  		case rc <- &ci:
  6799  		default:
  6800  			s.Warnf("Failed placing consumer info result on internal chan")
  6801  		}
  6802  	}
  6803  	s.mu.Unlock()
  6804  
  6805  	// Cleanup after.
  6806  	defer func() {
  6807  		s.mu.Lock()
  6808  		if s.sys != nil && s.sys.replies != nil {
  6809  			delete(s.sys.replies, inbox)
  6810  		}
  6811  		s.mu.Unlock()
  6812  	}()
  6813  
  6814  	var missingNames []string
  6815  	sent := map[string]struct{}{}
  6816  
  6817  	// Send out our requests here.
  6818  	js.mu.RLock()
  6819  	for _, ca := range consumers {
  6820  		if s.allPeersOffline(ca.Group) {
  6821  			// Place offline onto our results by hand here.
  6822  			ci := &ConsumerInfo{
  6823  				Config:    ca.Config,
  6824  				Created:   ca.Created,
  6825  				Cluster:   js.offlineClusterInfo(ca.Group),
  6826  				TimeStamp: time.Now().UTC(),
  6827  			}
  6828  			resp.Consumers = append(resp.Consumers, ci)
  6829  			missingNames = append(missingNames, ca.Name)
  6830  		} else {
  6831  			isubj := fmt.Sprintf(clusterConsumerInfoT, ca.Client.serviceAccount(), stream, ca.Name)
  6832  			s.sendInternalMsgLocked(isubj, inbox, nil, nil)
  6833  			sent[ca.Name] = struct{}{}
  6834  		}
  6835  	}
  6836  	// Don't hold lock.
  6837  	js.mu.RUnlock()
  6838  
  6839  	const timeout = 4 * time.Second
  6840  	notActive := time.NewTimer(timeout)
  6841  	defer notActive.Stop()
  6842  
  6843  LOOP:
  6844  	for len(sent) > 0 {
  6845  		select {
  6846  		case <-s.quitCh:
  6847  			return
  6848  		case <-notActive.C:
  6849  			s.Warnf("Did not receive all consumer info results for '%s > %s'", acc, stream)
  6850  			for cName := range sent {
  6851  				missingNames = append(missingNames, cName)
  6852  			}
  6853  			break LOOP
  6854  		case ci := <-rc:
  6855  			delete(sent, ci.Name)
  6856  			resp.Consumers = append(resp.Consumers, ci)
  6857  			// Check to see if we are done.
  6858  			if len(resp.Consumers) == len(consumers) {
  6859  				break LOOP
  6860  			}
  6861  		}
  6862  	}
  6863  
  6864  	// Needs to be sorted as well.
  6865  	if len(resp.Consumers) > 1 {
  6866  		sort.Slice(resp.Consumers, func(i, j int) bool {
  6867  			return strings.Compare(resp.Consumers[i].Name, resp.Consumers[j].Name) < 0
  6868  		})
  6869  	}
  6870  
  6871  	resp.Total = ocnt
  6872  	resp.Limit = JSApiListLimit
  6873  	resp.Offset = offset
  6874  	resp.Missing = missingNames
  6875  	s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6876  }
  6877  
  6878  func encodeStreamPurge(sp *streamPurge) []byte {
  6879  	var bb bytes.Buffer
  6880  	bb.WriteByte(byte(purgeStreamOp))
  6881  	json.NewEncoder(&bb).Encode(sp)
  6882  	return bb.Bytes()
  6883  }
  6884  
  6885  func decodeStreamPurge(buf []byte) (*streamPurge, error) {
  6886  	var sp streamPurge
  6887  	err := json.Unmarshal(buf, &sp)
  6888  	return &sp, err
  6889  }
  6890  
  6891  func (s *Server) jsClusteredConsumerDeleteRequest(ci *ClientInfo, acc *Account, stream, consumer, subject, reply string, rmsg []byte) {
  6892  	js, cc := s.getJetStreamCluster()
  6893  	if js == nil || cc == nil {
  6894  		return
  6895  	}
  6896  
  6897  	js.mu.Lock()
  6898  	defer js.mu.Unlock()
  6899  
  6900  	if cc.meta == nil {
  6901  		return
  6902  	}
  6903  
  6904  	var resp = JSApiConsumerDeleteResponse{ApiResponse: ApiResponse{Type: JSApiConsumerDeleteResponseType}}
  6905  
  6906  	sa := js.streamAssignment(acc.Name, stream)
  6907  	if sa == nil {
  6908  		resp.Error = NewJSStreamNotFoundError()
  6909  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6910  		return
  6911  
  6912  	}
  6913  	if sa.consumers == nil {
  6914  		resp.Error = NewJSConsumerNotFoundError()
  6915  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6916  		return
  6917  	}
  6918  	oca := sa.consumers[consumer]
  6919  	if oca == nil {
  6920  		resp.Error = NewJSConsumerNotFoundError()
  6921  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  6922  		return
  6923  	}
  6924  	oca.deleted = true
  6925  	ca := &consumerAssignment{Group: oca.Group, Stream: stream, Name: consumer, Config: oca.Config, Subject: subject, Reply: reply, Client: ci}
  6926  	cc.meta.Propose(encodeDeleteConsumerAssignment(ca))
  6927  }
  6928  
  6929  func encodeMsgDelete(md *streamMsgDelete) []byte {
  6930  	var bb bytes.Buffer
  6931  	bb.WriteByte(byte(deleteMsgOp))
  6932  	json.NewEncoder(&bb).Encode(md)
  6933  	return bb.Bytes()
  6934  }
  6935  
  6936  func decodeMsgDelete(buf []byte) (*streamMsgDelete, error) {
  6937  	var md streamMsgDelete
  6938  	err := json.Unmarshal(buf, &md)
  6939  	return &md, err
  6940  }
  6941  
  6942  func (s *Server) jsClusteredMsgDeleteRequest(ci *ClientInfo, acc *Account, mset *stream, stream, subject, reply string, req *JSApiMsgDeleteRequest, rmsg []byte) {
  6943  	js, cc := s.getJetStreamCluster()
  6944  	if js == nil || cc == nil {
  6945  		return
  6946  	}
  6947  
  6948  	js.mu.Lock()
  6949  	sa := js.streamAssignment(acc.Name, stream)
  6950  	if sa == nil {
  6951  		s.Debugf("Message delete failed, could not locate stream '%s > %s'", acc.Name, stream)
  6952  		js.mu.Unlock()
  6953  		return
  6954  	}
  6955  
  6956  	// Check for single replica items.
  6957  	if n := sa.Group.node; n != nil {
  6958  		md := streamMsgDelete{Seq: req.Seq, NoErase: req.NoErase, Stream: stream, Subject: subject, Reply: reply, Client: ci}
  6959  		n.Propose(encodeMsgDelete(&md))
  6960  		js.mu.Unlock()
  6961  		return
  6962  	}
  6963  	js.mu.Unlock()
  6964  
  6965  	if mset == nil {
  6966  		return
  6967  	}
  6968  
  6969  	var err error
  6970  	var removed bool
  6971  	if req.NoErase {
  6972  		removed, err = mset.removeMsg(req.Seq)
  6973  	} else {
  6974  		removed, err = mset.eraseMsg(req.Seq)
  6975  	}
  6976  	var resp = JSApiMsgDeleteResponse{ApiResponse: ApiResponse{Type: JSApiMsgDeleteResponseType}}
  6977  	if err != nil {
  6978  		resp.Error = NewJSStreamMsgDeleteFailedError(err, Unless(err))
  6979  	} else if !removed {
  6980  		resp.Error = NewJSSequenceNotFoundError(req.Seq)
  6981  	} else {
  6982  		resp.Success = true
  6983  	}
  6984  	s.sendAPIResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(resp))
  6985  }
  6986  
  6987  func encodeAddStreamAssignment(sa *streamAssignment) []byte {
  6988  	var bb bytes.Buffer
  6989  	bb.WriteByte(byte(assignStreamOp))
  6990  	json.NewEncoder(&bb).Encode(sa)
  6991  	return bb.Bytes()
  6992  }
  6993  
  6994  func encodeUpdateStreamAssignment(sa *streamAssignment) []byte {
  6995  	var bb bytes.Buffer
  6996  	bb.WriteByte(byte(updateStreamOp))
  6997  	json.NewEncoder(&bb).Encode(sa)
  6998  	return bb.Bytes()
  6999  }
  7000  
  7001  func encodeDeleteStreamAssignment(sa *streamAssignment) []byte {
  7002  	var bb bytes.Buffer
  7003  	bb.WriteByte(byte(removeStreamOp))
  7004  	json.NewEncoder(&bb).Encode(sa)
  7005  	return bb.Bytes()
  7006  }
  7007  
  7008  func decodeStreamAssignment(buf []byte) (*streamAssignment, error) {
  7009  	var sa streamAssignment
  7010  	err := json.Unmarshal(buf, &sa)
  7011  	if err != nil {
  7012  		return nil, err
  7013  	}
  7014  	fixCfgMirrorWithDedupWindow(sa.Config)
  7015  	return &sa, err
  7016  }
  7017  
  7018  func encodeDeleteRange(dr *DeleteRange) []byte {
  7019  	var bb bytes.Buffer
  7020  	bb.WriteByte(byte(deleteRangeOp))
  7021  	json.NewEncoder(&bb).Encode(dr)
  7022  	return bb.Bytes()
  7023  }
  7024  
  7025  func decodeDeleteRange(buf []byte) (*DeleteRange, error) {
  7026  	var dr DeleteRange
  7027  	err := json.Unmarshal(buf, &dr)
  7028  	if err != nil {
  7029  		return nil, err
  7030  	}
  7031  	return &dr, err
  7032  }
  7033  
  7034  // createGroupForConsumer will create a new group from same peer set as the stream.
  7035  func (cc *jetStreamCluster) createGroupForConsumer(cfg *ConsumerConfig, sa *streamAssignment) *raftGroup {
  7036  	if len(sa.Group.Peers) == 0 || cfg.Replicas > len(sa.Group.Peers) {
  7037  		return nil
  7038  	}
  7039  
  7040  	peers := copyStrings(sa.Group.Peers)
  7041  	var _ss [5]string
  7042  	active := _ss[:0]
  7043  
  7044  	// Calculate all active peers.
  7045  	for _, peer := range peers {
  7046  		if sir, ok := cc.s.nodeToInfo.Load(peer); ok && sir != nil {
  7047  			if !sir.(nodeInfo).offline {
  7048  				active = append(active, peer)
  7049  			}
  7050  		}
  7051  	}
  7052  	if quorum := cfg.Replicas/2 + 1; quorum > len(active) {
  7053  		// Not enough active to satisfy the request.
  7054  		return nil
  7055  	}
  7056  
  7057  	// If we want less then our parent stream, select from active.
  7058  	if cfg.Replicas > 0 && cfg.Replicas < len(peers) {
  7059  		// Pedantic in case stream is say R5 and consumer is R3 and 3 or more offline, etc.
  7060  		if len(active) < cfg.Replicas {
  7061  			return nil
  7062  		}
  7063  		// First shuffle the active peers and then select to account for replica = 1.
  7064  		rand.Shuffle(len(active), func(i, j int) { active[i], active[j] = active[j], active[i] })
  7065  		peers = active[:cfg.Replicas]
  7066  	}
  7067  	storage := sa.Config.Storage
  7068  	if cfg.MemoryStorage {
  7069  		storage = MemoryStorage
  7070  	}
  7071  	return &raftGroup{Name: groupNameForConsumer(peers, storage), Storage: storage, Peers: peers}
  7072  }
  7073  
  7074  // jsClusteredConsumerRequest is first point of entry to create a consumer in clustered mode.
  7075  func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subject, reply string, rmsg []byte, stream string, cfg *ConsumerConfig, action ConsumerAction) {
  7076  	js, cc := s.getJetStreamCluster()
  7077  	if js == nil || cc == nil {
  7078  		return
  7079  	}
  7080  
  7081  	var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}}
  7082  
  7083  	streamCfg, ok := js.clusterStreamConfig(acc.Name, stream)
  7084  	if !ok {
  7085  		resp.Error = NewJSStreamNotFoundError()
  7086  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7087  		return
  7088  	}
  7089  	selectedLimits, _, _, apiErr := acc.selectLimits(&streamCfg)
  7090  	if apiErr != nil {
  7091  		resp.Error = apiErr
  7092  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7093  		return
  7094  	}
  7095  	srvLim := &s.getOpts().JetStreamLimits
  7096  	// Make sure we have sane defaults
  7097  	setConsumerConfigDefaults(cfg, &streamCfg, srvLim, selectedLimits)
  7098  
  7099  	if err := checkConsumerCfg(cfg, srvLim, &streamCfg, acc, selectedLimits, false); err != nil {
  7100  		resp.Error = err
  7101  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7102  		return
  7103  	}
  7104  
  7105  	js.mu.Lock()
  7106  	defer js.mu.Unlock()
  7107  
  7108  	if cc.meta == nil {
  7109  		return
  7110  	}
  7111  
  7112  	// Lookup the stream assignment.
  7113  	sa := js.streamAssignment(acc.Name, stream)
  7114  	if sa == nil {
  7115  		resp.Error = NewJSStreamNotFoundError()
  7116  		s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7117  		return
  7118  	}
  7119  
  7120  	// Check for max consumers here to short circuit if possible.
  7121  	// Start with limit on a stream, but if one is defined at the level of the account
  7122  	// and is lower, use that limit.
  7123  	maxc := sa.Config.MaxConsumers
  7124  	if maxc <= 0 || (selectedLimits.MaxConsumers > 0 && selectedLimits.MaxConsumers < maxc) {
  7125  		maxc = selectedLimits.MaxConsumers
  7126  	}
  7127  	if maxc > 0 {
  7128  		// Don't count DIRECTS.
  7129  		total := 0
  7130  		for _, ca := range sa.consumers {
  7131  			if ca.Config != nil && !ca.Config.Direct {
  7132  				total++
  7133  			}
  7134  		}
  7135  		if total >= maxc {
  7136  			resp.Error = NewJSMaximumConsumersLimitError()
  7137  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7138  			return
  7139  		}
  7140  	}
  7141  
  7142  	// Also short circuit if DeliverLastPerSubject is set with no FilterSubject.
  7143  	if cfg.DeliverPolicy == DeliverLastPerSubject {
  7144  		if cfg.FilterSubject == _EMPTY_ && len(cfg.FilterSubjects) == 0 {
  7145  			resp.Error = NewJSConsumerInvalidPolicyError(fmt.Errorf("consumer delivery policy is deliver last per subject, but FilterSubject is not set"))
  7146  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7147  			return
  7148  		}
  7149  	}
  7150  
  7151  	// Setup proper default for ack wait if we are in explicit ack mode.
  7152  	if cfg.AckWait == 0 && (cfg.AckPolicy == AckExplicit || cfg.AckPolicy == AckAll) {
  7153  		cfg.AckWait = JsAckWaitDefault
  7154  	}
  7155  	// Setup default of -1, meaning no limit for MaxDeliver.
  7156  	if cfg.MaxDeliver == 0 {
  7157  		cfg.MaxDeliver = -1
  7158  	}
  7159  	// Set proper default for max ack pending if we are ack explicit and none has been set.
  7160  	if cfg.AckPolicy == AckExplicit && cfg.MaxAckPending == 0 {
  7161  		cfg.MaxAckPending = JsDefaultMaxAckPending
  7162  	}
  7163  
  7164  	var ca *consumerAssignment
  7165  	var oname string
  7166  
  7167  	// See if we have an existing one already under same durable name or
  7168  	// if name was set by the user.
  7169  	if isDurableConsumer(cfg) || cfg.Name != _EMPTY_ {
  7170  		if cfg.Name != _EMPTY_ {
  7171  			oname = cfg.Name
  7172  		} else {
  7173  			oname = cfg.Durable
  7174  		}
  7175  		if ca = sa.consumers[oname]; ca != nil && !ca.deleted {
  7176  			if action == ActionCreate && !reflect.DeepEqual(cfg, ca.Config) {
  7177  				resp.Error = NewJSConsumerAlreadyExistsError()
  7178  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7179  				return
  7180  			}
  7181  			// Do quick sanity check on new cfg to prevent here if possible.
  7182  			if err := acc.checkNewConsumerConfig(ca.Config, cfg); err != nil {
  7183  				resp.Error = NewJSConsumerCreateError(err, Unless(err))
  7184  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7185  				return
  7186  			}
  7187  		}
  7188  	}
  7189  
  7190  	// If this is new consumer.
  7191  	if ca == nil {
  7192  		if action == ActionUpdate {
  7193  			resp.Error = NewJSConsumerDoesNotExistError()
  7194  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7195  			return
  7196  		}
  7197  		rg := cc.createGroupForConsumer(cfg, sa)
  7198  		if rg == nil {
  7199  			resp.Error = NewJSInsufficientResourcesError()
  7200  			s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7201  			return
  7202  		}
  7203  		// Pick a preferred leader.
  7204  		rg.setPreferred()
  7205  
  7206  		// Inherit cluster from stream.
  7207  		rg.Cluster = sa.Group.Cluster
  7208  
  7209  		// We need to set the ephemeral here before replicating.
  7210  		if !isDurableConsumer(cfg) {
  7211  			// We chose to have ephemerals be R=1 unless stream is interest or workqueue.
  7212  			// Consumer can override.
  7213  			if sa.Config.Retention == LimitsPolicy && cfg.Replicas <= 1 {
  7214  				rg.Peers = []string{rg.Preferred}
  7215  				rg.Name = groupNameForConsumer(rg.Peers, rg.Storage)
  7216  			}
  7217  			if cfg.Name != _EMPTY_ {
  7218  				oname = cfg.Name
  7219  			} else {
  7220  				// Make sure name is unique.
  7221  				for {
  7222  					oname = createConsumerName()
  7223  					if sa.consumers != nil {
  7224  						if sa.consumers[oname] != nil {
  7225  							continue
  7226  						}
  7227  					}
  7228  					break
  7229  				}
  7230  			}
  7231  		}
  7232  		if len(rg.Peers) > 1 {
  7233  			if maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets; maxHaAssets != 0 {
  7234  				for _, peer := range rg.Peers {
  7235  					if ni, ok := s.nodeToInfo.Load(peer); ok {
  7236  						ni := ni.(nodeInfo)
  7237  						if stats := ni.stats; stats != nil && stats.HAAssets > maxHaAssets {
  7238  							resp.Error = NewJSInsufficientResourcesError()
  7239  							s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7240  							s.Warnf("%s@%s (HA Asset Count: %d) exceeds max ha asset limit of %d"+
  7241  								" for (durable) consumer %s placement on stream %s",
  7242  								ni.name, ni.cluster, ni.stats.HAAssets, maxHaAssets, oname, stream)
  7243  							return
  7244  						}
  7245  					}
  7246  				}
  7247  			}
  7248  		}
  7249  
  7250  		// Check if we are work queue policy.
  7251  		// We will do pre-checks here to avoid thrashing meta layer.
  7252  		if sa.Config.Retention == WorkQueuePolicy && !cfg.Direct {
  7253  			if cfg.AckPolicy != AckExplicit {
  7254  				resp.Error = NewJSConsumerWQRequiresExplicitAckError()
  7255  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7256  				return
  7257  			}
  7258  			subjects := gatherSubjectFilters(cfg.FilterSubject, cfg.FilterSubjects)
  7259  			if len(subjects) == 0 && len(sa.consumers) > 0 {
  7260  				resp.Error = NewJSConsumerWQMultipleUnfilteredError()
  7261  				s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7262  				return
  7263  			}
  7264  			// Check here to make sure we have not collided with another.
  7265  			if len(sa.consumers) > 0 {
  7266  				for _, oca := range sa.consumers {
  7267  					if oca.Name == oname {
  7268  						continue
  7269  					}
  7270  					for _, psubj := range gatherSubjectFilters(oca.Config.FilterSubject, oca.Config.FilterSubjects) {
  7271  						for _, subj := range subjects {
  7272  							if SubjectsCollide(subj, psubj) {
  7273  								resp.Error = NewJSConsumerWQConsumerNotUniqueError()
  7274  								s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
  7275  								return
  7276  							}
  7277  						}
  7278  					}
  7279  				}
  7280  			}
  7281  		}
  7282  
  7283  		ca = &consumerAssignment{
  7284  			Group:   rg,
  7285  			Stream:  stream,
  7286  			Name:    oname,
  7287  			Config:  cfg,
  7288  			Subject: subject,
  7289  			Reply:   reply,
  7290  			Client:  ci,
  7291  			Created: time.Now().UTC(),
  7292  		}
  7293  	} else {
  7294  		// If the consumer already exists then don't allow updating the PauseUntil, just set
  7295  		// it back to whatever the current configured value is.
  7296  		cfg.PauseUntil = ca.Config.PauseUntil
  7297  
  7298  		nca := ca.copyGroup()
  7299  
  7300  		rBefore := nca.Config.replicas(sa.Config)
  7301  		rAfter := cfg.replicas(sa.Config)
  7302  
  7303  		var curLeader string
  7304  		if rBefore != rAfter {
  7305  			// We are modifying nodes here. We want to do our best to preserve the current leader.
  7306  			// We have support now from above that guarantees we are in our own Go routine, so can
  7307  			// ask for stream info from the stream leader to make sure we keep the leader in the new list.
  7308  			if !s.allPeersOffline(ca.Group) {
  7309  				// Need to release js lock.
  7310  				js.mu.Unlock()
  7311  				if ci, err := sysRequest[ConsumerInfo](s, clusterConsumerInfoT, ci.serviceAccount(), sa.Config.Name, cfg.Durable); err != nil {
  7312  					s.Warnf("Did not receive consumer info results for '%s > %s > %s' due to: %s", acc, sa.Config.Name, cfg.Durable, err)
  7313  				} else if ci != nil {
  7314  					if cl := ci.Cluster; cl != nil {
  7315  						curLeader = getHash(cl.Leader)
  7316  					}
  7317  				}
  7318  				// Re-acquire here.
  7319  				js.mu.Lock()
  7320  			}
  7321  		}
  7322  
  7323  		if rBefore < rAfter {
  7324  			newPeerSet := nca.Group.Peers
  7325  			// scale up by adding new members from the stream peer set that are not yet in the consumer peer set
  7326  			streamPeerSet := copyStrings(sa.Group.Peers)
  7327  			rand.Shuffle(rAfter, func(i, j int) { streamPeerSet[i], streamPeerSet[j] = streamPeerSet[j], streamPeerSet[i] })
  7328  			for _, p := range streamPeerSet {
  7329  				found := false
  7330  				for _, sp := range newPeerSet {
  7331  					if sp == p {
  7332  						found = true
  7333  						break
  7334  					}
  7335  				}
  7336  				if !found {
  7337  					newPeerSet = append(newPeerSet, p)
  7338  					if len(newPeerSet) == rAfter {
  7339  						break
  7340  					}
  7341  				}
  7342  			}
  7343  			nca.Group.Peers = newPeerSet
  7344  			nca.Group.Preferred = curLeader
  7345  		} else if rBefore > rAfter {
  7346  			newPeerSet := nca.Group.Peers
  7347  			// mark leader preferred and move it to end
  7348  			nca.Group.Preferred = curLeader
  7349  			if nca.Group.Preferred != _EMPTY_ {
  7350  				for i, p := range newPeerSet {
  7351  					if nca.Group.Preferred == p {
  7352  						newPeerSet[i] = newPeerSet[len(newPeerSet)-1]
  7353  						newPeerSet[len(newPeerSet)-1] = p
  7354  					}
  7355  				}
  7356  			}
  7357  			// scale down by removing peers from the end
  7358  			newPeerSet = newPeerSet[len(newPeerSet)-rAfter:]
  7359  			nca.Group.Peers = newPeerSet
  7360  		}
  7361  
  7362  		// Update config and client info on copy of existing.
  7363  		nca.Config = cfg
  7364  		nca.Client = ci
  7365  		nca.Subject = subject
  7366  		nca.Reply = reply
  7367  		ca = nca
  7368  	}
  7369  
  7370  	// Mark this as pending.
  7371  	if sa.consumers == nil {
  7372  		sa.consumers = make(map[string]*consumerAssignment)
  7373  	}
  7374  	sa.consumers[ca.Name] = ca
  7375  
  7376  	// Do formal proposal.
  7377  	cc.meta.Propose(encodeAddConsumerAssignment(ca))
  7378  }
  7379  
  7380  func encodeAddConsumerAssignment(ca *consumerAssignment) []byte {
  7381  	var bb bytes.Buffer
  7382  	bb.WriteByte(byte(assignConsumerOp))
  7383  	json.NewEncoder(&bb).Encode(ca)
  7384  	return bb.Bytes()
  7385  }
  7386  
  7387  func encodeDeleteConsumerAssignment(ca *consumerAssignment) []byte {
  7388  	var bb bytes.Buffer
  7389  	bb.WriteByte(byte(removeConsumerOp))
  7390  	json.NewEncoder(&bb).Encode(ca)
  7391  	return bb.Bytes()
  7392  }
  7393  
  7394  func decodeConsumerAssignment(buf []byte) (*consumerAssignment, error) {
  7395  	var ca consumerAssignment
  7396  	err := json.Unmarshal(buf, &ca)
  7397  	return &ca, err
  7398  }
  7399  
  7400  func encodeAddConsumerAssignmentCompressed(ca *consumerAssignment) []byte {
  7401  	b, err := json.Marshal(ca)
  7402  	if err != nil {
  7403  		return nil
  7404  	}
  7405  	// TODO(dlc) - Streaming better approach here probably.
  7406  	var bb bytes.Buffer
  7407  	bb.WriteByte(byte(assignCompressedConsumerOp))
  7408  	bb.Write(s2.Encode(nil, b))
  7409  	return bb.Bytes()
  7410  }
  7411  
  7412  func decodeConsumerAssignmentCompressed(buf []byte) (*consumerAssignment, error) {
  7413  	var ca consumerAssignment
  7414  	js, err := s2.Decode(nil, buf)
  7415  	if err != nil {
  7416  		return nil, err
  7417  	}
  7418  	err = json.Unmarshal(js, &ca)
  7419  	return &ca, err
  7420  }
  7421  
  7422  var errBadStreamMsg = errors.New("jetstream cluster bad replicated stream msg")
  7423  
  7424  func decodeStreamMsg(buf []byte) (subject, reply string, hdr, msg []byte, lseq uint64, ts int64, err error) {
  7425  	var le = binary.LittleEndian
  7426  	if len(buf) < 26 {
  7427  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7428  	}
  7429  	lseq = le.Uint64(buf)
  7430  	buf = buf[8:]
  7431  	ts = int64(le.Uint64(buf))
  7432  	buf = buf[8:]
  7433  	sl := int(le.Uint16(buf))
  7434  	buf = buf[2:]
  7435  	if len(buf) < sl {
  7436  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7437  	}
  7438  	subject = string(buf[:sl])
  7439  	buf = buf[sl:]
  7440  	if len(buf) < 2 {
  7441  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7442  	}
  7443  	rl := int(le.Uint16(buf))
  7444  	buf = buf[2:]
  7445  	if len(buf) < rl {
  7446  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7447  	}
  7448  	reply = string(buf[:rl])
  7449  	buf = buf[rl:]
  7450  	if len(buf) < 2 {
  7451  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7452  	}
  7453  	hl := int(le.Uint16(buf))
  7454  	buf = buf[2:]
  7455  	if len(buf) < hl {
  7456  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7457  	}
  7458  	if hdr = buf[:hl]; len(hdr) == 0 {
  7459  		hdr = nil
  7460  	}
  7461  	buf = buf[hl:]
  7462  	if len(buf) < 4 {
  7463  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7464  	}
  7465  	ml := int(le.Uint32(buf))
  7466  	buf = buf[4:]
  7467  	if len(buf) < ml {
  7468  		return _EMPTY_, _EMPTY_, nil, nil, 0, 0, errBadStreamMsg
  7469  	}
  7470  	if msg = buf[:ml]; len(msg) == 0 {
  7471  		msg = nil
  7472  	}
  7473  	return subject, reply, hdr, msg, lseq, ts, nil
  7474  }
  7475  
  7476  // Helper to return if compression allowed.
  7477  func (mset *stream) compressAllowed() bool {
  7478  	mset.clMu.Lock()
  7479  	defer mset.clMu.Unlock()
  7480  	return mset.compressOK
  7481  }
  7482  
  7483  func encodeStreamMsg(subject, reply string, hdr, msg []byte, lseq uint64, ts int64) []byte {
  7484  	return encodeStreamMsgAllowCompress(subject, reply, hdr, msg, lseq, ts, false)
  7485  }
  7486  
  7487  // Threshold for compression.
  7488  // TODO(dlc) - Eventually make configurable.
  7489  const compressThreshold = 256
  7490  
  7491  // If allowed and contents over the threshold we will compress.
  7492  func encodeStreamMsgAllowCompress(subject, reply string, hdr, msg []byte, lseq uint64, ts int64, compressOK bool) []byte {
  7493  	shouldCompress := compressOK && len(subject)+len(reply)+len(hdr)+len(msg) > compressThreshold
  7494  
  7495  	elen := 1 + 8 + 8 + len(subject) + len(reply) + len(hdr) + len(msg)
  7496  	elen += (2 + 2 + 2 + 4) // Encoded lengths, 4bytes
  7497  	// TODO(dlc) - check sizes of subject, reply and hdr, make sure uint16 ok.
  7498  	buf := make([]byte, elen)
  7499  	buf[0] = byte(streamMsgOp)
  7500  	var le = binary.LittleEndian
  7501  	wi := 1
  7502  	le.PutUint64(buf[wi:], lseq)
  7503  	wi += 8
  7504  	le.PutUint64(buf[wi:], uint64(ts))
  7505  	wi += 8
  7506  	le.PutUint16(buf[wi:], uint16(len(subject)))
  7507  	wi += 2
  7508  	copy(buf[wi:], subject)
  7509  	wi += len(subject)
  7510  	le.PutUint16(buf[wi:], uint16(len(reply)))
  7511  	wi += 2
  7512  	copy(buf[wi:], reply)
  7513  	wi += len(reply)
  7514  	le.PutUint16(buf[wi:], uint16(len(hdr)))
  7515  	wi += 2
  7516  	if len(hdr) > 0 {
  7517  		copy(buf[wi:], hdr)
  7518  		wi += len(hdr)
  7519  	}
  7520  	le.PutUint32(buf[wi:], uint32(len(msg)))
  7521  	wi += 4
  7522  	if len(msg) > 0 {
  7523  		copy(buf[wi:], msg)
  7524  		wi += len(msg)
  7525  	}
  7526  
  7527  	// Check if we should compress.
  7528  	if shouldCompress {
  7529  		nbuf := make([]byte, s2.MaxEncodedLen(elen))
  7530  		nbuf[0] = byte(compressedStreamMsgOp)
  7531  		ebuf := s2.Encode(nbuf[1:], buf[1:wi])
  7532  		// Only pay cost of decode the other side if we compressed.
  7533  		// S2 will allow us to try without major penalty for non-compressable data.
  7534  		if len(ebuf) < wi {
  7535  			nbuf = nbuf[:len(ebuf)+1]
  7536  			buf, wi = nbuf, len(nbuf)
  7537  		}
  7538  	}
  7539  
  7540  	return buf[:wi]
  7541  }
  7542  
  7543  // Determine if all peers in our set support the binary snapshot.
  7544  func (mset *stream) supportsBinarySnapshot() bool {
  7545  	mset.mu.RLock()
  7546  	defer mset.mu.RUnlock()
  7547  	return mset.supportsBinarySnapshotLocked()
  7548  }
  7549  
  7550  // Determine if all peers in our set support the binary snapshot.
  7551  // Lock should be held.
  7552  func (mset *stream) supportsBinarySnapshotLocked() bool {
  7553  	s, n := mset.srv, mset.node
  7554  	if s == nil || n == nil {
  7555  		return false
  7556  	}
  7557  	// Grab our peers and walk them to make sure we can all support binary stream snapshots.
  7558  	id, peers := n.ID(), n.Peers()
  7559  	for _, p := range peers {
  7560  		if p.ID == id {
  7561  			// We know we support ourselves.
  7562  			continue
  7563  		}
  7564  		if sir, ok := s.nodeToInfo.Load(p.ID); !ok || sir == nil || !sir.(nodeInfo).binarySnapshots {
  7565  			return false
  7566  		}
  7567  	}
  7568  	return true
  7569  }
  7570  
  7571  // StreamSnapshot is used for snapshotting and out of band catch up in clustered mode.
  7572  // Legacy, replace with binary stream snapshots.
  7573  type streamSnapshot struct {
  7574  	Msgs     uint64   `json:"messages"`
  7575  	Bytes    uint64   `json:"bytes"`
  7576  	FirstSeq uint64   `json:"first_seq"`
  7577  	LastSeq  uint64   `json:"last_seq"`
  7578  	Failed   uint64   `json:"clfs"`
  7579  	Deleted  []uint64 `json:"deleted,omitempty"`
  7580  }
  7581  
  7582  // Grab a snapshot of a stream for clustered mode.
  7583  func (mset *stream) stateSnapshot() []byte {
  7584  	mset.mu.RLock()
  7585  	defer mset.mu.RUnlock()
  7586  	return mset.stateSnapshotLocked()
  7587  }
  7588  
  7589  // Grab a snapshot of a stream for clustered mode.
  7590  // Lock should be held.
  7591  func (mset *stream) stateSnapshotLocked() []byte {
  7592  	// Decide if we can support the new style of stream snapshots.
  7593  	if mset.supportsBinarySnapshotLocked() {
  7594  		snap, _ := mset.store.EncodedStreamState(mset.getCLFS())
  7595  		return snap
  7596  	}
  7597  
  7598  	// Older v1 version with deleted as a sorted []uint64.
  7599  	state := mset.store.State()
  7600  	snap := &streamSnapshot{
  7601  		Msgs:     state.Msgs,
  7602  		Bytes:    state.Bytes,
  7603  		FirstSeq: state.FirstSeq,
  7604  		LastSeq:  state.LastSeq,
  7605  		Failed:   mset.getCLFS(),
  7606  		Deleted:  state.Deleted,
  7607  	}
  7608  	b, _ := json.Marshal(snap)
  7609  	return b
  7610  }
  7611  
  7612  // Will check if we can do message compression in RAFT and catchup logic.
  7613  func (mset *stream) checkAllowMsgCompress(peers []string) {
  7614  	allowed := true
  7615  	for _, id := range peers {
  7616  		sir, ok := mset.srv.nodeToInfo.Load(id)
  7617  		if !ok || sir == nil {
  7618  			allowed = false
  7619  			break
  7620  		}
  7621  		// Check for capability.
  7622  		if si := sir.(nodeInfo); si.cfg == nil || !si.cfg.CompressOK {
  7623  			allowed = false
  7624  			break
  7625  		}
  7626  	}
  7627  	mset.mu.Lock()
  7628  	mset.compressOK = allowed
  7629  	mset.mu.Unlock()
  7630  }
  7631  
  7632  // To warn when we are getting too far behind from what has been proposed vs what has been committed.
  7633  const streamLagWarnThreshold = 10_000
  7634  
  7635  // processClusteredInboundMsg will propose the inbound message to the underlying raft group.
  7636  func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg []byte, mt *msgTrace) (retErr error) {
  7637  	// For possible error response.
  7638  	var response []byte
  7639  
  7640  	mset.mu.RLock()
  7641  	canRespond := !mset.cfg.NoAck && len(reply) > 0
  7642  	name, stype, store := mset.cfg.Name, mset.cfg.Storage, mset.store
  7643  	s, js, jsa, st, r, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.tier, mset.outq, mset.node
  7644  	maxMsgSize, lseq := int(mset.cfg.MaxMsgSize), mset.lseq
  7645  	interestPolicy, discard, maxMsgs, maxBytes := mset.cfg.Retention != LimitsPolicy, mset.cfg.Discard, mset.cfg.MaxMsgs, mset.cfg.MaxBytes
  7646  	isLeader, isSealed := mset.isLeader(), mset.cfg.Sealed
  7647  	mset.mu.RUnlock()
  7648  
  7649  	// This should not happen but possible now that we allow scale up, and scale down where this could trigger.
  7650  	//
  7651  	// We also invoke this in clustering mode for message tracing when not
  7652  	// performing message delivery.
  7653  	if node == nil || mt.traceOnly() {
  7654  		return mset.processJetStreamMsg(subject, reply, hdr, msg, 0, 0, mt)
  7655  	}
  7656  
  7657  	// If message tracing (with message delivery), we will need to send the
  7658  	// event on exit in case there was an error (if message was not proposed).
  7659  	// Otherwise, the event will be sent from processJetStreamMsg when
  7660  	// invoked by the leader (from applyStreamEntries).
  7661  	if mt != nil {
  7662  		defer func() {
  7663  			if retErr != nil {
  7664  				mt.sendEventFromJetStream(retErr)
  7665  			}
  7666  		}()
  7667  	}
  7668  
  7669  	// Check that we are the leader. This can be false if we have scaled up from an R1 that had inbound queued messages.
  7670  	if !isLeader {
  7671  		return NewJSClusterNotLeaderError()
  7672  	}
  7673  
  7674  	// Bail here if sealed.
  7675  	if isSealed {
  7676  		var resp = JSPubAckResponse{PubAck: &PubAck{Stream: mset.name()}, Error: NewJSStreamSealedError()}
  7677  		b, _ := json.Marshal(resp)
  7678  		mset.outq.sendMsg(reply, b)
  7679  		return NewJSStreamSealedError()
  7680  	}
  7681  
  7682  	// Check here pre-emptively if we have exceeded this server limits.
  7683  	if js.limitsExceeded(stype) {
  7684  		s.resourcesExceededError()
  7685  		if canRespond {
  7686  			b, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: NewJSInsufficientResourcesError()})
  7687  			outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, b, nil, 0))
  7688  		}
  7689  		// Stepdown regardless.
  7690  		if node := mset.raftNode(); node != nil {
  7691  			node.StepDown()
  7692  		}
  7693  		return NewJSInsufficientResourcesError()
  7694  	}
  7695  
  7696  	// Check here pre-emptively if we have exceeded our account limits.
  7697  	if exceeded, err := jsa.wouldExceedLimits(st, tierName, r, subject, hdr, msg); exceeded {
  7698  		if err == nil {
  7699  			err = NewJSAccountResourcesExceededError()
  7700  		}
  7701  		s.RateLimitWarnf(err.Error())
  7702  		if canRespond {
  7703  			var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7704  			resp.Error = err
  7705  			response, _ = json.Marshal(resp)
  7706  			outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0))
  7707  		}
  7708  		return err
  7709  	}
  7710  
  7711  	// Check msgSize if we have a limit set there. Again this works if it goes through but better to be pre-emptive.
  7712  	if maxMsgSize >= 0 && (len(hdr)+len(msg)) > maxMsgSize {
  7713  		err := fmt.Errorf("JetStream message size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name)
  7714  		s.RateLimitWarnf(err.Error())
  7715  		if canRespond {
  7716  			var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7717  			resp.Error = NewJSStreamMessageExceedsMaximumError()
  7718  			response, _ = json.Marshal(resp)
  7719  			outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0))
  7720  		}
  7721  		return err
  7722  	}
  7723  
  7724  	// Some header checks can be checked pre proposal. Most can not.
  7725  	var msgId string
  7726  	if len(hdr) > 0 {
  7727  		// Since we encode header len as u16 make sure we do not exceed.
  7728  		// Again this works if it goes through but better to be pre-emptive.
  7729  		if len(hdr) > math.MaxUint16 {
  7730  			err := fmt.Errorf("JetStream header size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name)
  7731  			s.RateLimitWarnf(err.Error())
  7732  			if canRespond {
  7733  				var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7734  				resp.Error = NewJSStreamHeaderExceedsMaximumError()
  7735  				response, _ = json.Marshal(resp)
  7736  				outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0))
  7737  			}
  7738  			return err
  7739  		}
  7740  		// Expected last sequence per subject.
  7741  		// We can check for last sequence per subject but only if the expected seq <= lseq.
  7742  		if seq, exists := getExpectedLastSeqPerSubject(hdr); exists && store != nil && seq > 0 && seq <= lseq {
  7743  			var smv StoreMsg
  7744  			var fseq uint64
  7745  			sm, err := store.LoadLastMsg(subject, &smv)
  7746  			if sm != nil {
  7747  				fseq = sm.seq
  7748  			}
  7749  			if err != nil || fseq != seq {
  7750  				if canRespond {
  7751  					var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7752  					resp.PubAck = &PubAck{Stream: name}
  7753  					resp.Error = NewJSStreamWrongLastSequenceError(fseq)
  7754  					b, _ := json.Marshal(resp)
  7755  					outq.sendMsg(reply, b)
  7756  				}
  7757  				return fmt.Errorf("last sequence by subject mismatch: %d vs %d", seq, fseq)
  7758  			}
  7759  		}
  7760  		// Expected stream name can also be pre-checked.
  7761  		if sname := getExpectedStream(hdr); sname != _EMPTY_ && sname != name {
  7762  			if canRespond {
  7763  				var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7764  				resp.PubAck = &PubAck{Stream: name}
  7765  				resp.Error = NewJSStreamNotMatchError()
  7766  				b, _ := json.Marshal(resp)
  7767  				outq.sendMsg(reply, b)
  7768  			}
  7769  			return errStreamMismatch
  7770  		}
  7771  		// Check for MsgIds here at the cluster level to avoid excessive CLFS accounting.
  7772  		// Will help during restarts.
  7773  		if msgId = getMsgId(hdr); msgId != _EMPTY_ {
  7774  			mset.mu.Lock()
  7775  			if dde := mset.checkMsgId(msgId); dde != nil {
  7776  				var buf [256]byte
  7777  				pubAck := append(buf[:0], mset.pubAck...)
  7778  				seq := dde.seq
  7779  				mset.mu.Unlock()
  7780  				if canRespond {
  7781  					response := append(pubAck, strconv.FormatUint(seq, 10)...)
  7782  					response = append(response, ",\"duplicate\": true}"...)
  7783  					outq.sendMsg(reply, response)
  7784  				}
  7785  				return errMsgIdDuplicate
  7786  			}
  7787  			// FIXME(dlc) - locking conflict with accessing mset.clseq
  7788  			// For now we stage with zero, and will update in processStreamMsg.
  7789  			mset.storeMsgIdLocked(&ddentry{msgId, 0, time.Now().UnixNano()})
  7790  			mset.mu.Unlock()
  7791  		}
  7792  	}
  7793  
  7794  	// Proceed with proposing this message.
  7795  
  7796  	// We only use mset.clseq for clustering and in case we run ahead of actual commits.
  7797  	// Check if we need to set initial value here
  7798  	mset.clMu.Lock()
  7799  	if mset.clseq == 0 || mset.clseq < lseq+mset.clfs {
  7800  		// Re-capture
  7801  		lseq = mset.lastSeq()
  7802  		mset.clseq = lseq + mset.clfs
  7803  	}
  7804  
  7805  	// Check if we have an interest policy and discard new with max msgs or bytes.
  7806  	// We need to deny here otherwise it could succeed on some peers and not others
  7807  	// depending on consumer ack state. So we deny here, if we allow that means we know
  7808  	// it would succeed on every peer.
  7809  	if interestPolicy && discard == DiscardNew && (maxMsgs > 0 || maxBytes > 0) {
  7810  		// Track inflight.
  7811  		if mset.inflight == nil {
  7812  			mset.inflight = make(map[uint64]uint64)
  7813  		}
  7814  		if stype == FileStorage {
  7815  			mset.inflight[mset.clseq] = fileStoreMsgSize(subject, hdr, msg)
  7816  		} else {
  7817  			mset.inflight[mset.clseq] = memStoreMsgSize(subject, hdr, msg)
  7818  		}
  7819  
  7820  		var state StreamState
  7821  		mset.store.FastState(&state)
  7822  
  7823  		var err error
  7824  		if maxMsgs > 0 && state.Msgs+uint64(len(mset.inflight)) > uint64(maxMsgs) {
  7825  			err = ErrMaxMsgs
  7826  		} else if maxBytes > 0 {
  7827  			// TODO(dlc) - Could track this rollup independently.
  7828  			var bytesPending uint64
  7829  			for _, nb := range mset.inflight {
  7830  				bytesPending += nb
  7831  			}
  7832  			if state.Bytes+bytesPending > uint64(maxBytes) {
  7833  				err = ErrMaxBytes
  7834  			}
  7835  		}
  7836  		if err != nil {
  7837  			delete(mset.inflight, mset.clseq)
  7838  			mset.clMu.Unlock()
  7839  			if canRespond {
  7840  				var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
  7841  				resp.Error = NewJSStreamStoreFailedError(err, Unless(err))
  7842  				response, _ = json.Marshal(resp)
  7843  				outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0))
  7844  			}
  7845  			return err
  7846  		}
  7847  	}
  7848  
  7849  	esm := encodeStreamMsgAllowCompress(subject, reply, hdr, msg, mset.clseq, time.Now().UnixNano(), mset.compressOK)
  7850  	var mtKey uint64
  7851  	if mt != nil {
  7852  		mtKey = mset.clseq
  7853  		if mset.mt == nil {
  7854  			mset.mt = make(map[uint64]*msgTrace)
  7855  		}
  7856  		mset.mt[mtKey] = mt
  7857  	}
  7858  
  7859  	// Do proposal.
  7860  	err := node.Propose(esm)
  7861  	if err == nil {
  7862  		mset.clseq++
  7863  	}
  7864  
  7865  	// Check to see if we are being overrun.
  7866  	// TODO(dlc) - Make this a limit where we drop messages to protect ourselves, but allow to be configured.
  7867  	if mset.clseq-(lseq+mset.clfs) > streamLagWarnThreshold {
  7868  		lerr := fmt.Errorf("JetStream stream '%s > %s' has high message lag", jsa.acc().Name, name)
  7869  		s.RateLimitWarnf(lerr.Error())
  7870  	}
  7871  	mset.clMu.Unlock()
  7872  
  7873  	if err != nil {
  7874  		if mt != nil {
  7875  			mset.getAndDeleteMsgTrace(mtKey)
  7876  		}
  7877  		if canRespond {
  7878  			var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: mset.cfg.Name}}
  7879  			resp.Error = &ApiError{Code: 503, Description: err.Error()}
  7880  			response, _ = json.Marshal(resp)
  7881  			// If we errored out respond here.
  7882  			outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0))
  7883  		}
  7884  		if isOutOfSpaceErr(err) {
  7885  			s.handleOutOfSpace(mset)
  7886  		}
  7887  	}
  7888  
  7889  	return err
  7890  }
  7891  
  7892  func (mset *stream) getAndDeleteMsgTrace(lseq uint64) *msgTrace {
  7893  	if mset == nil {
  7894  		return nil
  7895  	}
  7896  	mset.clMu.Lock()
  7897  	mt, ok := mset.mt[lseq]
  7898  	if ok {
  7899  		delete(mset.mt, lseq)
  7900  	}
  7901  	mset.clMu.Unlock()
  7902  	return mt
  7903  }
  7904  
  7905  // For requesting messages post raft snapshot to catch up streams post server restart.
  7906  // Any deleted msgs etc will be handled inline on catchup.
  7907  type streamSyncRequest struct {
  7908  	Peer           string `json:"peer,omitempty"`
  7909  	FirstSeq       uint64 `json:"first_seq"`
  7910  	LastSeq        uint64 `json:"last_seq"`
  7911  	DeleteRangesOk bool   `json:"delete_ranges"`
  7912  }
  7913  
  7914  // Given a stream state that represents a snapshot, calculate the sync request based on our current state.
  7915  // Stream lock must be held.
  7916  func (mset *stream) calculateSyncRequest(state *StreamState, snap *StreamReplicatedState) *streamSyncRequest {
  7917  	// Shouldn't happen, but consequences are pretty bad if we have the lock held and
  7918  	// our caller tries to take the lock again on panic defer, as in processSnapshot.
  7919  	if state == nil || snap == nil || mset.node == nil {
  7920  		return nil
  7921  	}
  7922  	// Quick check if we are already caught up.
  7923  	if state.LastSeq >= snap.LastSeq {
  7924  		return nil
  7925  	}
  7926  	return &streamSyncRequest{FirstSeq: state.LastSeq + 1, LastSeq: snap.LastSeq, Peer: mset.node.ID(), DeleteRangesOk: true}
  7927  }
  7928  
  7929  // processSnapshotDeletes will update our current store based on the snapshot
  7930  // but only processing deletes and new FirstSeq / purges.
  7931  func (mset *stream) processSnapshotDeletes(snap *StreamReplicatedState) {
  7932  	mset.mu.Lock()
  7933  	var state StreamState
  7934  	mset.store.FastState(&state)
  7935  	// Always adjust if FirstSeq has moved beyond our state.
  7936  	var didReset bool
  7937  	if snap.FirstSeq > state.FirstSeq {
  7938  		mset.store.Compact(snap.FirstSeq)
  7939  		mset.store.FastState(&state)
  7940  		mset.lseq = state.LastSeq
  7941  		mset.clearAllPreAcksBelowFloor(state.FirstSeq)
  7942  		didReset = true
  7943  	}
  7944  	s := mset.srv
  7945  	mset.mu.Unlock()
  7946  
  7947  	if didReset {
  7948  		s.Warnf("Catchup for stream '%s > %s' resetting first sequence: %d on catchup request",
  7949  			mset.account(), mset.name(), snap.FirstSeq)
  7950  	}
  7951  
  7952  	if len(snap.Deleted) > 0 {
  7953  		mset.store.SyncDeleted(snap.Deleted)
  7954  	}
  7955  }
  7956  
  7957  func (mset *stream) setCatchupPeer(peer string, lag uint64) {
  7958  	if peer == _EMPTY_ {
  7959  		return
  7960  	}
  7961  	mset.mu.Lock()
  7962  	if mset.catchups == nil {
  7963  		mset.catchups = make(map[string]uint64)
  7964  	}
  7965  	mset.catchups[peer] = lag
  7966  	mset.mu.Unlock()
  7967  }
  7968  
  7969  // Will decrement by one.
  7970  func (mset *stream) updateCatchupPeer(peer string) {
  7971  	if peer == _EMPTY_ {
  7972  		return
  7973  	}
  7974  	mset.mu.Lock()
  7975  	if lag := mset.catchups[peer]; lag > 0 {
  7976  		mset.catchups[peer] = lag - 1
  7977  	}
  7978  	mset.mu.Unlock()
  7979  }
  7980  
  7981  func (mset *stream) decrementCatchupPeer(peer string, num uint64) {
  7982  	if peer == _EMPTY_ {
  7983  		return
  7984  	}
  7985  	mset.mu.Lock()
  7986  	if lag := mset.catchups[peer]; lag > 0 {
  7987  		if lag >= num {
  7988  			lag -= num
  7989  		} else {
  7990  			lag = 0
  7991  		}
  7992  		mset.catchups[peer] = lag
  7993  	}
  7994  	mset.mu.Unlock()
  7995  }
  7996  
  7997  func (mset *stream) clearCatchupPeer(peer string) {
  7998  	mset.mu.Lock()
  7999  	if mset.catchups != nil {
  8000  		delete(mset.catchups, peer)
  8001  	}
  8002  	mset.mu.Unlock()
  8003  }
  8004  
  8005  // Lock should be held.
  8006  func (mset *stream) clearAllCatchupPeers() {
  8007  	if mset.catchups != nil {
  8008  		mset.catchups = nil
  8009  	}
  8010  }
  8011  
  8012  func (mset *stream) lagForCatchupPeer(peer string) uint64 {
  8013  	mset.mu.RLock()
  8014  	defer mset.mu.RUnlock()
  8015  	if mset.catchups == nil {
  8016  		return 0
  8017  	}
  8018  	return mset.catchups[peer]
  8019  }
  8020  
  8021  func (mset *stream) hasCatchupPeers() bool {
  8022  	mset.mu.RLock()
  8023  	defer mset.mu.RUnlock()
  8024  	return len(mset.catchups) > 0
  8025  }
  8026  
  8027  func (mset *stream) setCatchingUp() {
  8028  	mset.catchup.Store(true)
  8029  }
  8030  
  8031  func (mset *stream) clearCatchingUp() {
  8032  	mset.catchup.Store(false)
  8033  }
  8034  
  8035  func (mset *stream) isCatchingUp() bool {
  8036  	return mset.catchup.Load()
  8037  }
  8038  
  8039  // Determine if a non-leader is current.
  8040  // Lock should be held.
  8041  func (mset *stream) isCurrent() bool {
  8042  	if mset.node == nil {
  8043  		return true
  8044  	}
  8045  	return mset.node.Current() && !mset.catchup.Load()
  8046  }
  8047  
  8048  // Maximum requests for the whole server that can be in flight at the same time.
  8049  const maxConcurrentSyncRequests = 16
  8050  
  8051  var (
  8052  	errCatchupCorruptSnapshot = errors.New("corrupt stream snapshot detected")
  8053  	errCatchupStalled         = errors.New("catchup stalled")
  8054  	errCatchupStreamStopped   = errors.New("stream has been stopped") // when a catchup is terminated due to the stream going away.
  8055  	errCatchupBadMsg          = errors.New("bad catchup msg")
  8056  	errCatchupWrongSeqForSkip = errors.New("wrong sequence for skipped msg")
  8057  )
  8058  
  8059  // Process a stream snapshot.
  8060  func (mset *stream) processSnapshot(snap *StreamReplicatedState) (e error) {
  8061  	// Update any deletes, etc.
  8062  	mset.processSnapshotDeletes(snap)
  8063  	mset.setCLFS(snap.Failed)
  8064  
  8065  	mset.mu.Lock()
  8066  	var state StreamState
  8067  	mset.store.FastState(&state)
  8068  	sreq := mset.calculateSyncRequest(&state, snap)
  8069  
  8070  	s, js, subject, n, st := mset.srv, mset.js, mset.sa.Sync, mset.node, mset.cfg.Storage
  8071  	qname := fmt.Sprintf("[ACC:%s] stream '%s' snapshot", mset.acc.Name, mset.cfg.Name)
  8072  	mset.mu.Unlock()
  8073  
  8074  	// Bug that would cause this to be empty on stream update.
  8075  	if subject == _EMPTY_ {
  8076  		return errCatchupCorruptSnapshot
  8077  	}
  8078  
  8079  	// Just return if up to date or already exceeded limits.
  8080  	if sreq == nil || js.limitsExceeded(st) {
  8081  		return nil
  8082  	}
  8083  
  8084  	// Pause the apply channel for our raft group while we catch up.
  8085  	if err := n.PauseApply(); err != nil {
  8086  		return err
  8087  	}
  8088  
  8089  	defer func() {
  8090  		// Don't bother resuming if server or stream is gone.
  8091  		if e != errCatchupStreamStopped && e != ErrServerNotRunning {
  8092  			n.ResumeApply()
  8093  		}
  8094  	}()
  8095  
  8096  	// Set our catchup state.
  8097  	mset.setCatchingUp()
  8098  	defer mset.clearCatchingUp()
  8099  
  8100  	var sub *subscription
  8101  	var err error
  8102  
  8103  	const activityInterval = 30 * time.Second
  8104  	notActive := time.NewTimer(activityInterval)
  8105  	defer notActive.Stop()
  8106  
  8107  	defer func() {
  8108  		if sub != nil {
  8109  			s.sysUnsubscribe(sub)
  8110  		}
  8111  		// Make sure any consumers are updated for the pending amounts.
  8112  		mset.mu.Lock()
  8113  		for _, o := range mset.consumers {
  8114  			o.mu.Lock()
  8115  			if o.isLeader() {
  8116  				o.streamNumPending()
  8117  			}
  8118  			o.mu.Unlock()
  8119  		}
  8120  		mset.mu.Unlock()
  8121  	}()
  8122  
  8123  	var releaseSem bool
  8124  	releaseSyncOutSem := func() {
  8125  		if !releaseSem {
  8126  			return
  8127  		}
  8128  		// Need to use select for the server shutdown case.
  8129  		select {
  8130  		case s.syncOutSem <- struct{}{}:
  8131  		default:
  8132  		}
  8133  		releaseSem = false
  8134  	}
  8135  	// On exit, we will release our semaphore if we acquired it.
  8136  	defer releaseSyncOutSem()
  8137  
  8138  	// Do not let this go on forever.
  8139  	const maxRetries = 3
  8140  	var numRetries int
  8141  
  8142  RETRY:
  8143  	// On retry, we need to release the semaphore we got. Call will be no-op
  8144  	// if releaseSem boolean has not been set to true on successfully getting
  8145  	// the semaphore.
  8146  	releaseSyncOutSem()
  8147  
  8148  	if n.GroupLeader() == _EMPTY_ {
  8149  		return fmt.Errorf("catchup for stream '%s > %s' aborted, no leader", mset.account(), mset.name())
  8150  	}
  8151  
  8152  	// If we have a sub clear that here.
  8153  	if sub != nil {
  8154  		s.sysUnsubscribe(sub)
  8155  		sub = nil
  8156  	}
  8157  
  8158  	if !s.isRunning() {
  8159  		return ErrServerNotRunning
  8160  	}
  8161  
  8162  	numRetries++
  8163  	if numRetries >= maxRetries {
  8164  		// Force a hard reset here.
  8165  		return errFirstSequenceMismatch
  8166  	}
  8167  
  8168  	// Block here if we have too many requests in flight.
  8169  	<-s.syncOutSem
  8170  	releaseSem = true
  8171  
  8172  	// We may have been blocked for a bit, so the reset needs to ensure that we
  8173  	// consume the already fired timer.
  8174  	if !notActive.Stop() {
  8175  		select {
  8176  		case <-notActive.C:
  8177  		default:
  8178  		}
  8179  	}
  8180  	notActive.Reset(activityInterval)
  8181  
  8182  	// Grab sync request again on failures.
  8183  	if sreq == nil {
  8184  		mset.mu.RLock()
  8185  		var state StreamState
  8186  		mset.store.FastState(&state)
  8187  		sreq = mset.calculateSyncRequest(&state, snap)
  8188  		mset.mu.RUnlock()
  8189  		if sreq == nil {
  8190  			return nil
  8191  		}
  8192  	}
  8193  
  8194  	// Used to transfer message from the wire to another Go routine internally.
  8195  	type im struct {
  8196  		msg   []byte
  8197  		reply string
  8198  	}
  8199  	// This is used to notify the leader that it should stop the runCatchup
  8200  	// because we are either bailing out or going to retry due to an error.
  8201  	notifyLeaderStopCatchup := func(mrec *im, err error) {
  8202  		if mrec.reply == _EMPTY_ {
  8203  			return
  8204  		}
  8205  		s.sendInternalMsgLocked(mrec.reply, _EMPTY_, nil, err.Error())
  8206  	}
  8207  
  8208  	msgsQ := newIPQueue[*im](s, qname)
  8209  	defer msgsQ.unregister()
  8210  
  8211  	// Send our catchup request here.
  8212  	reply := syncReplySubject()
  8213  	sub, err = s.sysSubscribe(reply, func(_ *subscription, _ *client, _ *Account, _, reply string, msg []byte) {
  8214  		// Make copy since we are using a buffer from the inbound client/route.
  8215  		msgsQ.push(&im{copyBytes(msg), reply})
  8216  	})
  8217  	if err != nil {
  8218  		s.Errorf("Could not subscribe to stream catchup: %v", err)
  8219  		goto RETRY
  8220  	}
  8221  
  8222  	// Send our sync request.
  8223  	b, _ := json.Marshal(sreq)
  8224  	s.sendInternalMsgLocked(subject, reply, nil, b)
  8225  	// Remember when we sent this out to avoid loop spins on errors below.
  8226  	reqSendTime := time.Now()
  8227  	// Clear our sync request.
  8228  	sreq = nil
  8229  
  8230  	// Run our own select loop here.
  8231  	for qch, lch := n.QuitC(), n.LeadChangeC(); ; {
  8232  		select {
  8233  		case <-msgsQ.ch:
  8234  			notActive.Reset(activityInterval)
  8235  
  8236  			mrecs := msgsQ.pop()
  8237  			for _, mrec := range mrecs {
  8238  				msg := mrec.msg
  8239  				// Check for eof signaling.
  8240  				if len(msg) == 0 {
  8241  					msgsQ.recycle(&mrecs)
  8242  					mset.checkInterestState()
  8243  					return nil
  8244  				}
  8245  				if _, err := mset.processCatchupMsg(msg); err == nil {
  8246  					if mrec.reply != _EMPTY_ {
  8247  						s.sendInternalMsgLocked(mrec.reply, _EMPTY_, nil, nil)
  8248  					}
  8249  				} else if isOutOfSpaceErr(err) {
  8250  					notifyLeaderStopCatchup(mrec, err)
  8251  					return err
  8252  				} else if err == NewJSInsufficientResourcesError() {
  8253  					notifyLeaderStopCatchup(mrec, err)
  8254  					if mset.js.limitsExceeded(mset.cfg.Storage) {
  8255  						s.resourcesExceededError()
  8256  					} else {
  8257  						s.Warnf("Catchup for stream '%s > %s' errored, account resources exceeded: %v", mset.account(), mset.name(), err)
  8258  					}
  8259  					msgsQ.recycle(&mrecs)
  8260  					return err
  8261  				} else {
  8262  					notifyLeaderStopCatchup(mrec, err)
  8263  					s.Warnf("Catchup for stream '%s > %s' errored, will retry: %v", mset.account(), mset.name(), err)
  8264  					msgsQ.recycle(&mrecs)
  8265  
  8266  					// Make sure we do not spin and make things worse.
  8267  					const minRetryWait = 2 * time.Second
  8268  					elapsed := time.Since(reqSendTime)
  8269  					if elapsed < minRetryWait {
  8270  						select {
  8271  						case <-s.quitCh:
  8272  							return ErrServerNotRunning
  8273  						case <-qch:
  8274  							return errCatchupStreamStopped
  8275  						case <-time.After(minRetryWait - elapsed):
  8276  						}
  8277  					}
  8278  					goto RETRY
  8279  				}
  8280  			}
  8281  			notActive.Reset(activityInterval)
  8282  			msgsQ.recycle(&mrecs)
  8283  		case <-notActive.C:
  8284  			if mrecs := msgsQ.pop(); len(mrecs) > 0 {
  8285  				mrec := mrecs[0]
  8286  				notifyLeaderStopCatchup(mrec, errCatchupStalled)
  8287  				msgsQ.recycle(&mrecs)
  8288  			}
  8289  			s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name())
  8290  			goto RETRY
  8291  		case <-s.quitCh:
  8292  			return ErrServerNotRunning
  8293  		case <-qch:
  8294  			return errCatchupStreamStopped
  8295  		case isLeader := <-lch:
  8296  			if isLeader {
  8297  				n.StepDown()
  8298  				goto RETRY
  8299  			}
  8300  		}
  8301  	}
  8302  }
  8303  
  8304  // processCatchupMsg will be called to process out of band catchup msgs from a sync request.
  8305  func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) {
  8306  	if len(msg) == 0 {
  8307  		return 0, errCatchupBadMsg
  8308  	}
  8309  	op := entryOp(msg[0])
  8310  	if op != streamMsgOp && op != compressedStreamMsgOp && op != deleteRangeOp {
  8311  		return 0, errCatchupBadMsg
  8312  	}
  8313  
  8314  	mbuf := msg[1:]
  8315  	if op == deleteRangeOp {
  8316  		dr, err := decodeDeleteRange(mbuf)
  8317  		if err != nil {
  8318  			return 0, errCatchupBadMsg
  8319  		}
  8320  		// Handle the delete range.
  8321  		// Make sure the sequences match up properly.
  8322  		mset.mu.Lock()
  8323  		if len(mset.preAcks) > 0 {
  8324  			for seq := dr.First; seq < dr.First+dr.Num; seq++ {
  8325  				mset.clearAllPreAcks(seq)
  8326  			}
  8327  		}
  8328  		if err = mset.store.SkipMsgs(dr.First, dr.Num); err != nil {
  8329  			mset.mu.Unlock()
  8330  			return 0, errCatchupWrongSeqForSkip
  8331  		}
  8332  		mset.lseq = dr.First + dr.Num - 1
  8333  		lseq := mset.lseq
  8334  		mset.mu.Unlock()
  8335  		return lseq, nil
  8336  	}
  8337  
  8338  	if op == compressedStreamMsgOp {
  8339  		var err error
  8340  		mbuf, err = s2.Decode(nil, mbuf)
  8341  		if err != nil {
  8342  			panic(err.Error())
  8343  		}
  8344  	}
  8345  
  8346  	subj, _, hdr, msg, seq, ts, err := decodeStreamMsg(mbuf)
  8347  	if err != nil {
  8348  		return 0, errCatchupBadMsg
  8349  	}
  8350  
  8351  	mset.mu.Lock()
  8352  	st := mset.cfg.Storage
  8353  	ddloaded := mset.ddloaded
  8354  	tierName := mset.tier
  8355  	replicas := mset.cfg.Replicas
  8356  
  8357  	if mset.hasAllPreAcks(seq, subj) {
  8358  		mset.clearAllPreAcks(seq)
  8359  		// Mark this to be skipped
  8360  		subj, ts = _EMPTY_, 0
  8361  	}
  8362  	mset.mu.Unlock()
  8363  
  8364  	if mset.js.limitsExceeded(st) {
  8365  		return 0, NewJSInsufficientResourcesError()
  8366  	} else if exceeded, apiErr := mset.jsa.limitsExceeded(st, tierName, replicas); apiErr != nil {
  8367  		return 0, apiErr
  8368  	} else if exceeded {
  8369  		return 0, NewJSInsufficientResourcesError()
  8370  	}
  8371  
  8372  	// Put into our store
  8373  	// Messages to be skipped have no subject or timestamp.
  8374  	// TODO(dlc) - formalize with skipMsgOp
  8375  	if subj == _EMPTY_ && ts == 0 {
  8376  		if lseq := mset.store.SkipMsg(); lseq != seq {
  8377  			return 0, errCatchupWrongSeqForSkip
  8378  		}
  8379  	} else if err := mset.store.StoreRawMsg(subj, hdr, msg, seq, ts); err != nil {
  8380  		return 0, err
  8381  	}
  8382  
  8383  	// Update our lseq.
  8384  	mset.setLastSeq(seq)
  8385  
  8386  	// Check for MsgId and if we have one here make sure to update our internal map.
  8387  	if len(hdr) > 0 {
  8388  		if msgId := getMsgId(hdr); msgId != _EMPTY_ {
  8389  			if !ddloaded {
  8390  				mset.mu.Lock()
  8391  				mset.rebuildDedupe()
  8392  				mset.mu.Unlock()
  8393  			}
  8394  			mset.storeMsgId(&ddentry{msgId, seq, ts})
  8395  		}
  8396  	}
  8397  
  8398  	return seq, nil
  8399  }
  8400  
  8401  func (mset *stream) handleClusterSyncRequest(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
  8402  	var sreq streamSyncRequest
  8403  	if err := json.Unmarshal(msg, &sreq); err != nil {
  8404  		// Log error.
  8405  		return
  8406  	}
  8407  	mset.srv.startGoRoutine(func() { mset.runCatchup(reply, &sreq) })
  8408  }
  8409  
  8410  // Lock should be held.
  8411  func (js *jetStream) offlineClusterInfo(rg *raftGroup) *ClusterInfo {
  8412  	s := js.srv
  8413  
  8414  	ci := &ClusterInfo{Name: s.ClusterName(), RaftGroup: rg.Name}
  8415  	for _, peer := range rg.Peers {
  8416  		if sir, ok := s.nodeToInfo.Load(peer); ok && sir != nil {
  8417  			si := sir.(nodeInfo)
  8418  			pi := &PeerInfo{Peer: peer, Name: si.name, Current: false, Offline: true}
  8419  			ci.Replicas = append(ci.Replicas, pi)
  8420  		}
  8421  	}
  8422  	return ci
  8423  }
  8424  
  8425  // clusterInfo will report on the status of the raft group.
  8426  func (js *jetStream) clusterInfo(rg *raftGroup) *ClusterInfo {
  8427  	if js == nil {
  8428  		return nil
  8429  	}
  8430  	js.mu.RLock()
  8431  	defer js.mu.RUnlock()
  8432  
  8433  	s := js.srv
  8434  	if rg == nil || rg.node == nil {
  8435  		return &ClusterInfo{
  8436  			Name:   s.cachedClusterName(),
  8437  			Leader: s.Name(),
  8438  		}
  8439  	}
  8440  
  8441  	n := rg.node
  8442  	ci := &ClusterInfo{
  8443  		Name:      s.cachedClusterName(),
  8444  		Leader:    s.serverNameForNode(n.GroupLeader()),
  8445  		RaftGroup: rg.Name,
  8446  	}
  8447  
  8448  	now := time.Now()
  8449  	id, peers := n.ID(), n.Peers()
  8450  
  8451  	// If we are leaderless, do not suppress putting us in the peer list.
  8452  	if ci.Leader == _EMPTY_ {
  8453  		id = _EMPTY_
  8454  	}
  8455  
  8456  	for _, rp := range peers {
  8457  		if rp.ID != id && rg.isMember(rp.ID) {
  8458  			var lastSeen time.Duration
  8459  			if now.After(rp.Last) && rp.Last.Unix() != 0 {
  8460  				lastSeen = now.Sub(rp.Last)
  8461  			}
  8462  			current := rp.Current
  8463  			if current && lastSeen > lostQuorumInterval {
  8464  				current = false
  8465  			}
  8466  			// Create a peer info with common settings if the peer has not been seen
  8467  			// yet (which can happen after the whole cluster is stopped and only some
  8468  			// of the nodes are restarted).
  8469  			pi := &PeerInfo{
  8470  				Current: current,
  8471  				Offline: true,
  8472  				Active:  lastSeen,
  8473  				Lag:     rp.Lag,
  8474  				Peer:    rp.ID,
  8475  			}
  8476  			// If node is found, complete/update the settings.
  8477  			if sir, ok := s.nodeToInfo.Load(rp.ID); ok && sir != nil {
  8478  				si := sir.(nodeInfo)
  8479  				pi.Name, pi.Offline, pi.cluster = si.name, si.offline, si.cluster
  8480  			} else {
  8481  				// If not, then add a name that indicates that the server name
  8482  				// is unknown at this time, and clear the lag since it is misleading
  8483  				// (the node may not have that much lag).
  8484  				// Note: We return now the Peer ID in PeerInfo, so the "(peerID: %s)"
  8485  				// would technically not be required, but keeping it for now.
  8486  				pi.Name, pi.Lag = fmt.Sprintf("Server name unknown at this time (peerID: %s)", rp.ID), 0
  8487  			}
  8488  			ci.Replicas = append(ci.Replicas, pi)
  8489  		}
  8490  	}
  8491  	// Order the result based on the name so that we get something consistent
  8492  	// when doing repeated stream info in the CLI, etc...
  8493  	sort.Slice(ci.Replicas, func(i, j int) bool {
  8494  		return ci.Replicas[i].Name < ci.Replicas[j].Name
  8495  	})
  8496  	return ci
  8497  }
  8498  
  8499  func (mset *stream) checkClusterInfo(ci *ClusterInfo) {
  8500  	for _, r := range ci.Replicas {
  8501  		peer := getHash(r.Name)
  8502  		if lag := mset.lagForCatchupPeer(peer); lag > 0 {
  8503  			r.Current = false
  8504  			r.Lag = lag
  8505  		}
  8506  	}
  8507  }
  8508  
  8509  // Return a list of alternates, ranked by preference order to the request, of stream mirrors.
  8510  // This allows clients to select or get more information about read replicas that could be a
  8511  // better option to connect to versus the original source.
  8512  func (js *jetStream) streamAlternates(ci *ClientInfo, stream string) []StreamAlternate {
  8513  	if js == nil {
  8514  		return nil
  8515  	}
  8516  
  8517  	js.mu.RLock()
  8518  	defer js.mu.RUnlock()
  8519  
  8520  	s, cc := js.srv, js.cluster
  8521  	// Track our domain.
  8522  	domain := s.getOpts().JetStreamDomain
  8523  
  8524  	// No clustering just return nil.
  8525  	if cc == nil {
  8526  		return nil
  8527  	}
  8528  	acc, _ := s.LookupAccount(ci.serviceAccount())
  8529  	if acc == nil {
  8530  		return nil
  8531  	}
  8532  
  8533  	// Collect our ordering first for clusters.
  8534  	weights := make(map[string]int)
  8535  	all := []string{ci.Cluster}
  8536  	all = append(all, ci.Alternates...)
  8537  
  8538  	for i := 0; i < len(all); i++ {
  8539  		weights[all[i]] = len(all) - i
  8540  	}
  8541  
  8542  	var alts []StreamAlternate
  8543  	for _, sa := range cc.streams[acc.Name] {
  8544  		// Add in ourselves and any mirrors.
  8545  		if sa.Config.Name == stream || (sa.Config.Mirror != nil && sa.Config.Mirror.Name == stream) {
  8546  			alts = append(alts, StreamAlternate{Name: sa.Config.Name, Domain: domain, Cluster: sa.Group.Cluster})
  8547  		}
  8548  	}
  8549  	// If just us don't fill in.
  8550  	if len(alts) == 1 {
  8551  		return nil
  8552  	}
  8553  
  8554  	// Sort based on our weights that originate from the request itself.
  8555  	sort.Slice(alts, func(i, j int) bool {
  8556  		return weights[alts[i].Cluster] > weights[alts[j].Cluster]
  8557  	})
  8558  
  8559  	return alts
  8560  }
  8561  
  8562  // Internal request for stream info, this is coming on the wire so do not block here.
  8563  func (mset *stream) handleClusterStreamInfoRequest(_ *subscription, c *client, _ *Account, subject, reply string, _ []byte) {
  8564  	go mset.processClusterStreamInfoRequest(reply)
  8565  }
  8566  
  8567  func (mset *stream) processClusterStreamInfoRequest(reply string) {
  8568  	mset.mu.RLock()
  8569  	sysc, js, sa, config := mset.sysc, mset.srv.js.Load(), mset.sa, mset.cfg
  8570  	isLeader := mset.isLeader()
  8571  	mset.mu.RUnlock()
  8572  
  8573  	// By design all members will receive this. Normally we only want the leader answering.
  8574  	// But if we have stalled and lost quorom all can respond.
  8575  	if sa != nil && !js.isGroupLeaderless(sa.Group) && !isLeader {
  8576  		return
  8577  	}
  8578  
  8579  	// If we are not the leader let someone else possibly respond first.
  8580  	if !isLeader {
  8581  		time.Sleep(500 * time.Millisecond)
  8582  	}
  8583  
  8584  	si := &StreamInfo{
  8585  		Created:   mset.createdTime(),
  8586  		State:     mset.state(),
  8587  		Config:    config,
  8588  		Cluster:   js.clusterInfo(mset.raftGroup()),
  8589  		Sources:   mset.sourcesInfo(),
  8590  		Mirror:    mset.mirrorInfo(),
  8591  		TimeStamp: time.Now().UTC(),
  8592  	}
  8593  
  8594  	// Check for out of band catchups.
  8595  	if mset.hasCatchupPeers() {
  8596  		mset.checkClusterInfo(si.Cluster)
  8597  	}
  8598  
  8599  	sysc.sendInternalMsg(reply, _EMPTY_, nil, si)
  8600  }
  8601  
  8602  // 64MB for now, for the total server. This is max we will blast out if asked to
  8603  // do so to another server for purposes of catchups.
  8604  // This number should be ok on 1Gbit interface.
  8605  const defaultMaxTotalCatchupOutBytes = int64(64 * 1024 * 1024)
  8606  
  8607  // Current total outstanding catchup bytes.
  8608  func (s *Server) gcbTotal() int64 {
  8609  	s.gcbMu.RLock()
  8610  	defer s.gcbMu.RUnlock()
  8611  	return s.gcbOut
  8612  }
  8613  
  8614  // Returns true if Current total outstanding catchup bytes is below
  8615  // the maximum configured.
  8616  func (s *Server) gcbBelowMax() bool {
  8617  	s.gcbMu.RLock()
  8618  	defer s.gcbMu.RUnlock()
  8619  	return s.gcbOut <= s.gcbOutMax
  8620  }
  8621  
  8622  // Adds `sz` to the server's total outstanding catchup bytes and to `localsz`
  8623  // under the gcbMu lock. The `localsz` points to the local outstanding catchup
  8624  // bytes of the runCatchup go routine of a given stream.
  8625  func (s *Server) gcbAdd(localsz *int64, sz int64) {
  8626  	s.gcbMu.Lock()
  8627  	atomic.AddInt64(localsz, sz)
  8628  	s.gcbOut += sz
  8629  	if s.gcbOut >= s.gcbOutMax && s.gcbKick == nil {
  8630  		s.gcbKick = make(chan struct{})
  8631  	}
  8632  	s.gcbMu.Unlock()
  8633  }
  8634  
  8635  // Removes `sz` from the server's total outstanding catchup bytes and from
  8636  // `localsz`, but only if `localsz` is non 0, which would signal that gcSubLast
  8637  // has already been invoked. See that function for details.
  8638  // Must be invoked under the gcbMu lock.
  8639  func (s *Server) gcbSubLocked(localsz *int64, sz int64) {
  8640  	if atomic.LoadInt64(localsz) == 0 {
  8641  		return
  8642  	}
  8643  	atomic.AddInt64(localsz, -sz)
  8644  	s.gcbOut -= sz
  8645  	if s.gcbKick != nil && s.gcbOut < s.gcbOutMax {
  8646  		close(s.gcbKick)
  8647  		s.gcbKick = nil
  8648  	}
  8649  }
  8650  
  8651  // Locked version of gcbSubLocked()
  8652  func (s *Server) gcbSub(localsz *int64, sz int64) {
  8653  	s.gcbMu.Lock()
  8654  	s.gcbSubLocked(localsz, sz)
  8655  	s.gcbMu.Unlock()
  8656  }
  8657  
  8658  // Similar to gcbSub() but reset `localsz` to 0 at the end under the gcbMu lock.
  8659  // This will signal further calls to gcbSub() for this `localsz` pointer that
  8660  // nothing should be done because runCatchup() has exited and any remaining
  8661  // outstanding bytes value has already been decremented.
  8662  func (s *Server) gcbSubLast(localsz *int64) {
  8663  	s.gcbMu.Lock()
  8664  	s.gcbSubLocked(localsz, *localsz)
  8665  	*localsz = 0
  8666  	s.gcbMu.Unlock()
  8667  }
  8668  
  8669  // Returns our kick chan, or nil if it does not exist.
  8670  func (s *Server) cbKickChan() <-chan struct{} {
  8671  	s.gcbMu.RLock()
  8672  	defer s.gcbMu.RUnlock()
  8673  	return s.gcbKick
  8674  }
  8675  
  8676  func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) {
  8677  	s := mset.srv
  8678  	defer s.grWG.Done()
  8679  
  8680  	const maxOutBytes = int64(64 * 1024 * 1024) // 64MB for now, these are all internal, from server to server
  8681  	const maxOutMsgs = int32(256 * 1024)        // 256k in case we have lots of small messages or skip msgs.
  8682  	outb := int64(0)
  8683  	outm := int32(0)
  8684  
  8685  	// On abnormal exit make sure to update global total.
  8686  	defer s.gcbSubLast(&outb)
  8687  
  8688  	// Flow control processing.
  8689  	ackReplySize := func(subj string) int64 {
  8690  		if li := strings.LastIndexByte(subj, btsep); li > 0 && li < len(subj) {
  8691  			return parseAckReplyNum(subj[li+1:])
  8692  		}
  8693  		return 0
  8694  	}
  8695  
  8696  	nextBatchC := make(chan struct{}, 1)
  8697  	nextBatchC <- struct{}{}
  8698  	remoteQuitCh := make(chan struct{})
  8699  
  8700  	const activityInterval = 30 * time.Second
  8701  	notActive := time.NewTimer(activityInterval)
  8702  	defer notActive.Stop()
  8703  
  8704  	// Setup ackReply for flow control.
  8705  	ackReply := syncAckSubject()
  8706  	ackSub, _ := s.sysSubscribe(ackReply, func(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
  8707  		if len(msg) > 0 {
  8708  			s.Warnf("Catchup for stream '%s > %s' was aborted on the remote due to: %q",
  8709  				mset.account(), mset.name(), msg)
  8710  			s.sysUnsubscribe(sub)
  8711  			close(remoteQuitCh)
  8712  			return
  8713  		}
  8714  		sz := ackReplySize(subject)
  8715  		s.gcbSub(&outb, sz)
  8716  		atomic.AddInt32(&outm, -1)
  8717  		mset.updateCatchupPeer(sreq.Peer)
  8718  		// Kick ourselves and anyone else who might have stalled on global state.
  8719  		select {
  8720  		case nextBatchC <- struct{}{}:
  8721  			// Reset our activity
  8722  			notActive.Reset(activityInterval)
  8723  		default:
  8724  		}
  8725  	})
  8726  	defer s.sysUnsubscribe(ackSub)
  8727  	ackReplyT := strings.ReplaceAll(ackReply, ".*", ".%d")
  8728  
  8729  	// Grab our state.
  8730  	var state StreamState
  8731  	mset.mu.RLock()
  8732  	mset.store.FastState(&state)
  8733  	mset.mu.RUnlock()
  8734  
  8735  	// Reset notion of first if this request wants sequences before our starting sequence
  8736  	// and we would have nothing to send. If we have partial messages still need to send skips for those.
  8737  	// We will keep sreq's first sequence to not create sequence mismatches on the follower, but we extend the last to our current state.
  8738  	if sreq.FirstSeq < state.FirstSeq && state.FirstSeq > sreq.LastSeq {
  8739  		s.Debugf("Catchup for stream '%s > %s' resetting request first sequence from %d to %d",
  8740  			mset.account(), mset.name(), sreq.FirstSeq, state.FirstSeq)
  8741  		if state.LastSeq > sreq.LastSeq {
  8742  			sreq.LastSeq = state.LastSeq
  8743  		}
  8744  	}
  8745  
  8746  	// Setup sequences to walk through.
  8747  	seq, last := sreq.FirstSeq, sreq.LastSeq
  8748  	mset.setCatchupPeer(sreq.Peer, last-seq)
  8749  
  8750  	// Check if we can compress during this.
  8751  	compressOk := mset.compressAllowed()
  8752  
  8753  	var spb int
  8754  	const minWait = 5 * time.Second
  8755  
  8756  	sendNextBatchAndContinue := func(qch chan struct{}) bool {
  8757  		// Check if we know we will not enter the loop because we are done.
  8758  		if seq > last {
  8759  			s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name())
  8760  			// EOF
  8761  			s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil)
  8762  			return false
  8763  		}
  8764  
  8765  		// If we already sent a batch, we will try to make sure we can at least send a minimum
  8766  		// batch before sending the next batch.
  8767  		if spb > 0 {
  8768  			// Wait til we can send at least 4k
  8769  			const minBatchWait = int32(4 * 1024)
  8770  			mw := time.NewTimer(minWait)
  8771  			for done := false; !done; {
  8772  				select {
  8773  				case <-nextBatchC:
  8774  					done = maxOutMsgs-atomic.LoadInt32(&outm) > minBatchWait
  8775  					if !done {
  8776  						// Wait for a small bit.
  8777  						time.Sleep(50 * time.Millisecond)
  8778  					} else {
  8779  						// GC friendly.
  8780  						mw.Stop()
  8781  					}
  8782  				case <-mw.C:
  8783  					done = true
  8784  				case <-s.quitCh:
  8785  					return false
  8786  				case <-qch:
  8787  					return false
  8788  				case <-remoteQuitCh:
  8789  					return false
  8790  				}
  8791  			}
  8792  			spb = 0
  8793  		}
  8794  
  8795  		// Send an encoded msg.
  8796  		sendEM := func(em []byte) {
  8797  			// Place size in reply subject for flow control.
  8798  			l := int64(len(em))
  8799  			reply := fmt.Sprintf(ackReplyT, l)
  8800  			s.gcbAdd(&outb, l)
  8801  			atomic.AddInt32(&outm, 1)
  8802  			s.sendInternalMsgLocked(sendSubject, reply, nil, em)
  8803  			spb++
  8804  		}
  8805  
  8806  		// If we support gap markers.
  8807  		var dr DeleteRange
  8808  		drOk := sreq.DeleteRangesOk
  8809  
  8810  		// Will send our delete range.
  8811  		// Should already be checked for being valid.
  8812  		sendDR := func() {
  8813  			if dr.Num == 1 {
  8814  				// Send like a normal skip msg.
  8815  				sendEM(encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, dr.First, 0))
  8816  			} else {
  8817  				// We have a run, send a gap record. We send these without reply or tracking.
  8818  				s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, encodeDeleteRange(&dr))
  8819  				// Clear out the pending for catchup.
  8820  				mset.decrementCatchupPeer(sreq.Peer, dr.Num)
  8821  			}
  8822  			// Reset always.
  8823  			dr.First, dr.Num = 0, 0
  8824  		}
  8825  
  8826  		var smv StoreMsg
  8827  		for ; seq <= last && atomic.LoadInt64(&outb) <= maxOutBytes && atomic.LoadInt32(&outm) <= maxOutMsgs && s.gcbBelowMax(); seq++ {
  8828  			sm, err := mset.store.LoadMsg(seq, &smv)
  8829  			// if this is not a deleted msg, bail out.
  8830  			if err != nil && err != ErrStoreMsgNotFound && err != errDeletedMsg {
  8831  				if err == ErrStoreEOF {
  8832  					var state StreamState
  8833  					mset.store.FastState(&state)
  8834  					if seq > state.LastSeq {
  8835  						// The snapshot has a larger last sequence then we have. This could be due to a truncation
  8836  						// when trying to recover after corruption, still not 100% sure. Could be off by 1 too somehow,
  8837  						// but tested a ton of those with no success.
  8838  						s.Warnf("Catchup for stream '%s > %s' completed, but requested sequence %d was larger then current state: %+v",
  8839  							mset.account(), mset.name(), seq, state)
  8840  						// Try our best to redo our invalidated snapshot as well.
  8841  						if n := mset.raftNode(); n != nil {
  8842  							n.InstallSnapshot(mset.stateSnapshot())
  8843  						}
  8844  						// Signal EOF
  8845  						s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil)
  8846  						return false
  8847  					}
  8848  				}
  8849  				s.Warnf("Error loading message for catchup '%s > %s': %v", mset.account(), mset.name(), err)
  8850  				return false
  8851  			}
  8852  
  8853  			if sm != nil {
  8854  				// If we allow gap markers check if we have one pending.
  8855  				if drOk && dr.First > 0 {
  8856  					sendDR()
  8857  				}
  8858  				// Send the normal message now.
  8859  				sendEM(encodeStreamMsgAllowCompress(sm.subj, _EMPTY_, sm.hdr, sm.msg, sm.seq, sm.ts, compressOk))
  8860  			} else {
  8861  				if drOk {
  8862  					if dr.First == 0 {
  8863  						dr.First, dr.Num = seq, 1
  8864  					} else {
  8865  						dr.Num++
  8866  					}
  8867  				} else {
  8868  					// Skip record for deleted msg.
  8869  					sendEM(encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq, 0))
  8870  				}
  8871  			}
  8872  
  8873  			// Check if we are done.
  8874  			if seq == last {
  8875  				// Need to see if we have a pending delete range.
  8876  				if drOk && dr.First > 0 {
  8877  					sendDR()
  8878  				}
  8879  				// Check for a condition where our state's first is now past the last that we could have sent.
  8880  				// If so reset last and continue sending.
  8881  				var state StreamState
  8882  				mset.mu.RLock()
  8883  				mset.store.FastState(&state)
  8884  				mset.mu.RUnlock()
  8885  				if last < state.FirstSeq {
  8886  					last = state.LastSeq
  8887  				}
  8888  				// Recheck our exit condition.
  8889  				if seq == last {
  8890  					s.Noticef("Catchup for stream '%s > %s' complete", mset.account(), mset.name())
  8891  					// EOF
  8892  					s.sendInternalMsgLocked(sendSubject, _EMPTY_, nil, nil)
  8893  					return false
  8894  				}
  8895  			}
  8896  			select {
  8897  			case <-remoteQuitCh:
  8898  				return false
  8899  			default:
  8900  			}
  8901  		}
  8902  		if drOk && dr.First > 0 {
  8903  			sendDR()
  8904  		}
  8905  
  8906  		return true
  8907  	}
  8908  
  8909  	// Check is this stream got closed.
  8910  	mset.mu.RLock()
  8911  	qch := mset.qch
  8912  	mset.mu.RUnlock()
  8913  	if qch == nil {
  8914  		return
  8915  	}
  8916  
  8917  	// Run as long as we are still active and need catchup.
  8918  	// FIXME(dlc) - Purge event? Stream delete?
  8919  	for {
  8920  		// Get this each time, will be non-nil if globally blocked and we will close to wake everyone up.
  8921  		cbKick := s.cbKickChan()
  8922  
  8923  		select {
  8924  		case <-s.quitCh:
  8925  			return
  8926  		case <-qch:
  8927  			return
  8928  		case <-remoteQuitCh:
  8929  			mset.clearCatchupPeer(sreq.Peer)
  8930  			return
  8931  		case <-notActive.C:
  8932  			s.Warnf("Catchup for stream '%s > %s' stalled", mset.account(), mset.name())
  8933  			mset.clearCatchupPeer(sreq.Peer)
  8934  			return
  8935  		case <-nextBatchC:
  8936  			if !sendNextBatchAndContinue(qch) {
  8937  				mset.clearCatchupPeer(sreq.Peer)
  8938  				return
  8939  			}
  8940  		case <-cbKick:
  8941  			if !sendNextBatchAndContinue(qch) {
  8942  				mset.clearCatchupPeer(sreq.Peer)
  8943  				return
  8944  			}
  8945  		}
  8946  	}
  8947  }
  8948  
  8949  const jscAllSubj = "$JSC.>"
  8950  
  8951  func syncSubjForStream() string {
  8952  	return syncSubject("$JSC.SYNC")
  8953  }
  8954  
  8955  func syncReplySubject() string {
  8956  	return syncSubject("$JSC.R")
  8957  }
  8958  
  8959  func infoReplySubject() string {
  8960  	return syncSubject("$JSC.R")
  8961  }
  8962  
  8963  func syncAckSubject() string {
  8964  	return syncSubject("$JSC.ACK") + ".*"
  8965  }
  8966  
  8967  func syncSubject(pre string) string {
  8968  	var sb strings.Builder
  8969  	sb.WriteString(pre)
  8970  	sb.WriteByte(btsep)
  8971  
  8972  	var b [replySuffixLen]byte
  8973  	rn := rand.Int63()
  8974  	for i, l := 0, rn; i < len(b); i++ {
  8975  		b[i] = digits[l%base]
  8976  		l /= base
  8977  	}
  8978  
  8979  	sb.Write(b[:])
  8980  	return sb.String()
  8981  }
  8982  
  8983  const (
  8984  	clusterStreamInfoT   = "$JSC.SI.%s.%s"
  8985  	clusterConsumerInfoT = "$JSC.CI.%s.%s.%s"
  8986  	jsaUpdatesSubT       = "$JSC.ARU.%s.*"
  8987  	jsaUpdatesPubT       = "$JSC.ARU.%s.%s"
  8988  )