github.com/nats-io/nats-server/v2@v2.11.0-preview.2/server/filestore.go (about)

     1  // Copyright 2019-2024 The NATS Authors
     2  // Licensed under the Apache License, Version 2.0 (the "License");
     3  // you may not use this file except in compliance with the License.
     4  // You may obtain a copy of the License at
     5  //
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package server
    15  
    16  import (
    17  	"archive/tar"
    18  	"bytes"
    19  	"crypto/aes"
    20  	"crypto/cipher"
    21  	"crypto/rand"
    22  	"crypto/sha256"
    23  	"encoding/binary"
    24  	"encoding/hex"
    25  	"encoding/json"
    26  	"errors"
    27  	"fmt"
    28  	"hash"
    29  	"io"
    30  	"math"
    31  	"net"
    32  	"os"
    33  	"path/filepath"
    34  	"sort"
    35  	"strings"
    36  	"sync"
    37  	"sync/atomic"
    38  	"time"
    39  
    40  	"github.com/klauspost/compress/s2"
    41  	"github.com/minio/highwayhash"
    42  	"github.com/nats-io/nats-server/v2/server/avl"
    43  	"github.com/nats-io/nats-server/v2/server/stree"
    44  	"golang.org/x/crypto/chacha20"
    45  	"golang.org/x/crypto/chacha20poly1305"
    46  )
    47  
// FileStoreConfig holds the tunable settings for a file-backed stream store.
type FileStoreConfig struct {
	// Where the parent directory for all storage will be located.
	StoreDir string
	// BlockSize is the file block size. This also represents the maximum overhead size.
	BlockSize uint64
	// CacheExpire is how long with no activity until we expire the cache.
	CacheExpire time.Duration
	// SyncInterval is how often we sync to disk in the background.
	SyncInterval time.Duration
	// SyncAlways is when the stream should sync all data writes.
	SyncAlways bool
	// AsyncFlush allows async flush to batch write operations.
	AsyncFlush bool
	// Cipher is the cipher to use when encrypting.
	Cipher StoreCipher
	// Compression is the algorithm to use when compressing.
	Compression StoreCompression

	// Internal reference to our server.
	srv *Server
}
    69  
// FileStreamInfo allows us to remember created time.
// It embeds StreamConfig so it round-trips as the config plus the created timestamp.
type FileStreamInfo struct {
	Created time.Time
	StreamConfig
}
    75  
    76  type StoreCipher int
    77  
    78  const (
    79  	ChaCha StoreCipher = iota
    80  	AES
    81  	NoCipher
    82  )
    83  
    84  func (cipher StoreCipher) String() string {
    85  	switch cipher {
    86  	case ChaCha:
    87  		return "ChaCha20-Poly1305"
    88  	case AES:
    89  		return "AES-GCM"
    90  	case NoCipher:
    91  		return "None"
    92  	default:
    93  		return "Unknown StoreCipher"
    94  	}
    95  }
    96  
    97  type StoreCompression uint8
    98  
    99  const (
   100  	NoCompression StoreCompression = iota
   101  	S2Compression
   102  )
   103  
   104  func (alg StoreCompression) String() string {
   105  	switch alg {
   106  	case NoCompression:
   107  		return "None"
   108  	case S2Compression:
   109  		return "S2"
   110  	default:
   111  		return "Unknown StoreCompression"
   112  	}
   113  }
   114  
   115  func (alg StoreCompression) MarshalJSON() ([]byte, error) {
   116  	var str string
   117  	switch alg {
   118  	case S2Compression:
   119  		str = "s2"
   120  	case NoCompression:
   121  		str = "none"
   122  	default:
   123  		return nil, fmt.Errorf("unknown compression algorithm")
   124  	}
   125  	return json.Marshal(str)
   126  }
   127  
   128  func (alg *StoreCompression) UnmarshalJSON(b []byte) error {
   129  	var str string
   130  	if err := json.Unmarshal(b, &str); err != nil {
   131  		return err
   132  	}
   133  	switch str {
   134  	case "s2":
   135  		*alg = S2Compression
   136  	case "none":
   137  		*alg = NoCompression
   138  	default:
   139  		return fmt.Errorf("unknown compression algorithm")
   140  	}
   141  	return nil
   142  }
   143  
// FileConsumerInfo is used for creating consumer stores.
// It remembers the created time and consumer name alongside the embedded config.
type FileConsumerInfo struct {
	Created time.Time
	Name    string
	ConsumerConfig
}
   150  
// Default file and directory permissions.
const (
	defaultDirPerms  = os.FileMode(0750) // rwxr-x---
	defaultFilePerms = os.FileMode(0640) // rw-r-----
)
   156  
// psi holds per-subject information kept in the subject index (fs.psim).
type psi struct {
	total uint64 // Total messages for this subject.
	fblk  uint32 // NOTE(review): appears to be the first block index holding this subject — confirm against usage.
	lblk  uint32 // NOTE(review): appears to be the last block index holding this subject — confirm against usage.
}
   162  
// fileStore is the file-backed implementation of a stream store.
type fileStore struct {
	srv         *Server         // Internal server reference (copied from fcfg.srv).
	mu          sync.RWMutex    // Guards the store state below.
	state       StreamState     // User-visible stream state.
	tombs       []uint64        // Tombstone sequences recovered at startup; processed then cleared.
	ld          *LostStreamData // Lost data detected during recovery, if any.
	scb         StorageUpdateHandler
	ageChk      *time.Timer // Timer driving MaxAge expiration checks.
	syncTmr     *time.Timer // Background disk sync timer.
	cfg         FileStreamInfo  // Stream config plus created time.
	fcfg        FileStoreConfig // File store config.
	prf         keyGen // Server PRF used to derive encryption keys (nil when unencrypted).
	oldprf      keyGen // Previous PRF, used during key/cipher conversions.
	aek         cipher.AEAD // Asset encryption key for stream metadata.
	lmb         *msgBlock   // Last (current write) message block.
	blks        []*msgBlock // All message blocks, in order.
	bim         map[uint32]*msgBlock // Block index -> message block.
	psim        *stree.SubjectTree[psi] // Per-subject index.
	tsl         int // NOTE(review): reset alongside psim on recovery; appears to track total subject length — confirm.
	adml        int // NOTE(review): purpose not visible in this chunk.
	hh          hash.Hash64 // HighwayHash used for meta checksums.
	qch         chan struct{} // Quit channel.
	fsld        chan struct{} // Signals flushStreamStateLoop shutdown.
	cmu         sync.RWMutex  // Guards consumer stores (cfs).
	cfs         []ConsumerStore
	sips        int // NOTE(review): purpose not visible in this chunk.
	dirty       int // Changes accumulated since the last full state write.
	closing     bool
	closed      bool
	fip         bool // Flush in place (set when AsyncFlush is false).
	receivedAny bool
}
   195  
// Represents a message store block and its data.
type msgBlock struct {
	// Here for 32bit systems and atomic.
	first      msgId
	last       msgId
	mu         sync.RWMutex
	fs         *fileStore
	aek        cipher.AEAD   // Asset encryption key for this block.
	bek        cipher.Stream // Block stream cipher (ChaCha20 or AES-CTR).
	seed       []byte        // Encryption key seed.
	nonce      []byte        // Encryption nonce.
	mfn        string        // Message block file name.
	mfd        *os.File      // Message block file descriptor.
	cmp        StoreCompression // Effective compression at the time of loading the block
	liwsz      int64
	index      uint32 // Block index, used to form file names (see blkScan/keyScan).
	bytes      uint64 // User visible bytes count.
	rbytes     uint64 // Total bytes (raw) including deleted. Used for rolling to new blk.
	msgs       uint64 // User visible message count.
	fss        map[string]*SimpleState // Per-subject state for this block.
	kfn        string // Key file name.
	lwts       int64 // NOTE(review): lwts/llts/lrts look like last write/load/read timestamps — confirm.
	llts       int64
	lrts       int64
	llseq      uint64
	hh         hash.Hash64 // HighwayHash for record checksums.
	cache      *cache      // Write-through cache, also used when loading messages.
	cloads     uint64      // Number of cache loads.
	cexp       time.Duration // Cache expiration interval.
	ctmr       *time.Timer   // Cache expiration timer.
	werr       error         // Last write error.
	dmap       avl.SequenceSet // Set of deleted sequences within the block.
	fch        chan struct{}
	qch        chan struct{}
	lchk       [8]byte // Last checksum read from the block file.
	loading    bool
	flusher    bool
	noTrack    bool // When set, per-subject state is not tracked.
	needSync   bool
	syncAlways bool
	closed     bool

	// Used to mock write failures.
	mockWriteErr bool
}
   241  
// Write through caching layer that is also used on loading messages.
type cache struct {
	buf  []byte   // Cached block contents.
	off  int      // NOTE(review): appears to be the offset of buf within the block file — confirm.
	wp   int      // Write position within buf.
	idx  []uint32 // Per-message index into buf.
	lrl  uint32   // NOTE(review): appears to be the last record length — confirm.
	fseq uint64   // First sequence covered by the cache.
	nra  bool     // NOTE(review): purpose not visible in this chunk.
}
   252  
// msgId pairs a message sequence number with its timestamp
// (nanoseconds, per the UnixNano usage elsewhere in this file).
type msgId struct {
	seq uint64
	ts  int64
}
   257  
const (
	// Magic is used to identify the file store files.
	magic = uint8(22)
	// Original version.
	version = uint8(1)
	// New IndexInfo Version
	newVersion = uint8(2)
	// hdrLen
	hdrLen = 2
	// This is where we keep the streams.
	streamsDir = "streams"
	// This is where we keep the message store blocks.
	msgDir = "msgs"
	// This is where we temporarily move the messages dir.
	purgeDir = "__msgs__"
	// used to scan blk file names.
	blkScan = "%d.blk"
	// used for compacted blocks that are staged.
	newScan = "%d.new"
	// used to scan index file names.
	indexScan = "%d.idx"
	// used to store our block encryption key.
	keyScan = "%d.key"
	// to look for orphans
	keyScanAll = "*.key"
	// This is where we keep state on consumers.
	consumerDir = "obs"
	// Index file for a consumer.
	consumerState = "o.dat"
	// The suffix that will be given to a new temporary block during compression.
	compressTmpSuffix = ".tmp"
	// This is where we keep state on templates.
	tmplsDir = "templates"
	// Maximum size of a write buffer we may consider for re-use.
	maxBufReuse = 2 * 1024 * 1024
	// default cache buffer expiration
	defaultCacheBufferExpiration = 2 * time.Second
	// default sync interval
	defaultSyncInterval = 2 * time.Minute
	// default idle timeout to close FDs.
	closeFDsIdle = 30 * time.Second
	// coalesceMinimum
	coalesceMinimum = 16 * 1024
	// maxFlushWait is maximum we will wait to gather messages to flush.
	maxFlushWait = 8 * time.Millisecond

	// Metafiles for streams and consumers.
	JetStreamMetaFile    = "meta.inf"
	JetStreamMetaFileSum = "meta.sum"
	JetStreamMetaFileKey = "meta.key"

	// This is the full snapshotted state for the stream.
	streamStreamStateFile = "index.db"

	// AEK key sizes
	minMetaKeySize = 64
	minBlkKeySize  = 64

	// Default stream block size.
	defaultLargeBlockSize = 8 * 1024 * 1024 // 8MB
	// Default for workqueue or interest based.
	defaultMediumBlockSize = 4 * 1024 * 1024 // 4MB
	// For smaller reuse buffers. Usually being generated during contention on the lead write buffer.
	// E.g. mirrors/sources etc.
	defaultSmallBlockSize = 1 * 1024 * 1024 // 1MB
	// Maximum size for the encrypted head block.
	maximumEncryptedBlockSize = 2 * 1024 * 1024 // 2MB
	// Default for KV based
	defaultKVBlockSize = defaultMediumBlockSize
	// max block size for now.
	maxBlockSize = defaultLargeBlockSize
	// Compact minimum threshold.
	compactMinimum = 2 * 1024 * 1024 // 2MB
	// FileStoreMinBlkSize is minimum size we will do for a blk size.
	FileStoreMinBlkSize = 32 * 1000 // 32,000 bytes (decimal ~32KB, not 32KiB)
	// FileStoreMaxBlkSize is maximum size we will do for a blk size.
	FileStoreMaxBlkSize = maxBlockSize
	// Check for bad record length value due to corrupt data.
	rlBadThresh = 32 * 1024 * 1024
	// Checksum size for hash for msg records.
	recordHashSize = 8
)
   340  
// newFileStore creates a file store with the current UTC time as the
// created time and no encryption key generation functions.
func newFileStore(fcfg FileStoreConfig, cfg StreamConfig) (*fileStore, error) {
	return newFileStoreWithCreated(fcfg, cfg, time.Now().UTC(), nil, nil)
}
   344  
// newFileStoreWithCreated creates (or recovers) a file-backed stream store.
// It validates the config, prepares the storage directories, recovers any
// prior state from disk, enforces stream limits, and starts the background
// sync timer and stream-state flush loop. prf/oldprf are the (optional)
// server PRFs used to derive encryption keys.
func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created time.Time, prf, oldprf keyGen) (*fileStore, error) {
	if cfg.Name == _EMPTY_ {
		return nil, fmt.Errorf("name required")
	}
	if cfg.Storage != FileStorage {
		return nil, fmt.Errorf("fileStore requires file storage type in config")
	}
	// Default values.
	if fcfg.BlockSize == 0 {
		fcfg.BlockSize = dynBlkSize(cfg.Retention, cfg.MaxBytes, prf != nil)
	}
	if fcfg.BlockSize > maxBlockSize {
		return nil, fmt.Errorf("filestore max block size is %s", friendlyBytes(maxBlockSize))
	}
	if fcfg.CacheExpire == 0 {
		fcfg.CacheExpire = defaultCacheBufferExpiration
	}
	if fcfg.SyncInterval == 0 {
		fcfg.SyncInterval = defaultSyncInterval
	}

	// Check the directory
	if stat, err := os.Stat(fcfg.StoreDir); os.IsNotExist(err) {
		if err := os.MkdirAll(fcfg.StoreDir, defaultDirPerms); err != nil {
			return nil, fmt.Errorf("could not create storage directory - %v", err)
		}
	} else if stat == nil || !stat.IsDir() {
		return nil, fmt.Errorf("storage directory is not a directory")
	}
	// Probe writability by creating (and removing) a temp file.
	tmpfile, err := os.CreateTemp(fcfg.StoreDir, "_test_")
	if err != nil {
		return nil, fmt.Errorf("storage directory is not writable")
	}

	tmpfile.Close()
	// dios appears to act as a disk-I/O concurrency limiter (defined elsewhere).
	<-dios
	os.Remove(tmpfile.Name())
	dios <- struct{}{}

	fs := &fileStore{
		fcfg:   fcfg,
		psim:   stree.NewSubjectTree[psi](),
		bim:    make(map[uint32]*msgBlock),
		cfg:    FileStreamInfo{Created: created, StreamConfig: cfg},
		prf:    prf,
		oldprf: oldprf,
		qch:    make(chan struct{}),
		fsld:   make(chan struct{}),
		srv:    fcfg.srv,
	}

	// Set flush in place to AsyncFlush which by default is false.
	fs.fip = !fcfg.AsyncFlush

	// Check if this is a new setup.
	mdir := filepath.Join(fcfg.StoreDir, msgDir)
	odir := filepath.Join(fcfg.StoreDir, consumerDir)
	if err := os.MkdirAll(mdir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create message storage directory - %v", err)
	}
	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create consumer storage directory - %v", err)
	}

	// Create highway hash for message blocks. Use sha256 of directory as key.
	key := sha256.Sum256([]byte(cfg.Name))
	fs.hh, err = highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}

	keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
	// Make sure we do not have an encrypted store underneath of us but no main key.
	if fs.prf == nil {
		if _, err := os.Stat(keyFile); err == nil {
			return nil, errNoMainKey
		}
	}

	// Attempt to recover our state.
	err = fs.recoverFullState()
	if err != nil {
		// Hold onto state
		prior := fs.state
		// Reset anything that could have been set from above.
		fs.state = StreamState{}
		fs.psim, fs.tsl = fs.psim.Empty(), 0
		fs.bim = make(map[uint32]*msgBlock)
		fs.blks = nil
		fs.tombs = nil

		// Recover our message state the old way
		if err := fs.recoverMsgs(); err != nil {
			return nil, err
		}

		// Check if our prior state remembers a last sequence past where we can see.
		if fs.ld != nil && prior.LastSeq > fs.state.LastSeq {
			fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime
			if lmb, err := fs.newMsgBlockForWrite(); err == nil {
				lmb.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano())
			} else {
				return nil, err
			}
		}
		// Since we recovered here, make sure to kick ourselves to write out our stream state.
		fs.dirty++
	}

	// Also make sure we get rid of old idx and fss files on return.
	// Do this in separate go routine vs inline and at end of processing.
	defer func() {
		go fs.cleanupOldMeta()
	}()

	// Lock while do enforcements and removals.
	fs.mu.Lock()

	// Check if we have any left over tombstones to process.
	if len(fs.tombs) > 0 {
		for _, seq := range fs.tombs {
			fs.removeMsg(seq, false, true, false)
			fs.removeFromLostData(seq)
		}
		// Not needed after this phase.
		fs.tombs = nil
	}

	// Limits checks and enforcement.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Do age checks too, make sure to call in place.
	if fs.cfg.MaxAge != 0 {
		fs.expireMsgsOnRecover()
		fs.startAgeChk()
	}

	// If we have max msgs per subject make sure the is also enforced.
	if fs.cfg.MaxMsgsPer > 0 {
		fs.enforceMsgPerSubjectLimit(false)
	}

	// Grab first sequence for check below while we have lock.
	firstSeq := fs.state.FirstSeq
	fs.mu.Unlock()

	// If the stream has an initial sequence number then make sure we
	// have purged up until that point. We will do this only if the
	// recovered first sequence number is before our configured first
	// sequence. Need to do this locked as by now the age check timer
	// has started.
	if cfg.FirstSeq > 0 && firstSeq <= cfg.FirstSeq {
		if _, err := fs.purge(cfg.FirstSeq); err != nil {
			return nil, err
		}
	}

	// Write our meta data if it does not exist or is zero'd out.
	meta := filepath.Join(fcfg.StoreDir, JetStreamMetaFile)
	fi, err := os.Stat(meta)
	if err != nil && os.IsNotExist(err) || fi != nil && fi.Size() == 0 {
		if err := fs.writeStreamMeta(); err != nil {
			return nil, err
		}
	}

	// If we expect to be encrypted check that what we are restoring is not plaintext.
	// This can happen on snapshot restores or conversions.
	if fs.prf != nil {
		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
			if err := fs.writeStreamMeta(); err != nil {
				return nil, err
			}
		}
	}

	// Setup our sync timer.
	fs.setSyncTimer()

	// Spin up the go routine that will write out our full state stream index.
	go fs.flushStreamStateLoop(fs.qch, fs.fsld)

	return fs, nil
}
   530  
   531  // Lock all existing message blocks.
   532  // Lock held on entry.
   533  func (fs *fileStore) lockAllMsgBlocks() {
   534  	for _, mb := range fs.blks {
   535  		mb.mu.Lock()
   536  	}
   537  }
   538  
   539  // Unlock all existing message blocks.
   540  // Lock held on entry.
   541  func (fs *fileStore) unlockAllMsgBlocks() {
   542  	for _, mb := range fs.blks {
   543  		mb.mu.Unlock()
   544  	}
   545  }
   546  
// UpdateConfig replaces the stream configuration, persists the new meta to
// disk (rolling back on write failure), and re-applies limit enforcement
// (max msgs, max bytes, max age, max msgs per subject) under the new
// settings. The original created time is preserved.
func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error {
	if fs.isClosed() {
		return ErrStoreClosed
	}
	if cfg.Name == _EMPTY_ {
		return fmt.Errorf("name required")
	}
	if cfg.Storage != FileStorage {
		return fmt.Errorf("fileStore requires file storage type in config")
	}

	fs.mu.Lock()
	new_cfg := FileStreamInfo{Created: fs.cfg.Created, StreamConfig: *cfg}
	old_cfg := fs.cfg
	// The reference story has changed here, so this full msg block lock
	// may not be needed.
	fs.lockAllMsgBlocks()
	fs.cfg = new_cfg
	fs.unlockAllMsgBlocks()
	// Persist new meta; on failure restore the previous config and bail.
	if err := fs.writeStreamMeta(); err != nil {
		fs.lockAllMsgBlocks()
		fs.cfg = old_cfg
		fs.unlockAllMsgBlocks()
		fs.mu.Unlock()
		return err
	}

	// Limits checks and enforcement.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Do age timers.
	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
		fs.startAgeChk()
	}
	if fs.ageChk != nil && fs.cfg.MaxAge == 0 {
		fs.ageChk.Stop()
		fs.ageChk = nil
	}

	// Only re-run per-subject enforcement when the limit tightened.
	if fs.cfg.MaxMsgsPer > 0 && fs.cfg.MaxMsgsPer < old_cfg.MaxMsgsPer {
		fs.enforceMsgPerSubjectLimit(true)
	}
	fs.mu.Unlock()

	// Expire messages now past the (possibly changed) age limit.
	if cfg.MaxAge != 0 {
		fs.expireMsgs()
	}
	return nil
}
   597  
   598  func dynBlkSize(retention RetentionPolicy, maxBytes int64, encrypted bool) uint64 {
   599  	if maxBytes > 0 {
   600  		blkSize := (maxBytes / 4) + 1 // (25% overhead)
   601  		// Round up to nearest 100
   602  		if m := blkSize % 100; m != 0 {
   603  			blkSize += 100 - m
   604  		}
   605  		if blkSize <= FileStoreMinBlkSize {
   606  			blkSize = FileStoreMinBlkSize
   607  		} else if blkSize >= FileStoreMaxBlkSize {
   608  			blkSize = FileStoreMaxBlkSize
   609  		} else {
   610  			blkSize = defaultMediumBlockSize
   611  		}
   612  		if encrypted && blkSize > maximumEncryptedBlockSize {
   613  			// Notes on this below.
   614  			blkSize = maximumEncryptedBlockSize
   615  		}
   616  		return uint64(blkSize)
   617  	}
   618  
   619  	switch {
   620  	case encrypted:
   621  		// In the case of encrypted stores, large blocks can result in worsened perf
   622  		// since many writes on disk involve re-encrypting the entire block. For now,
   623  		// we will enforce a cap on the block size when encryption is enabled to avoid
   624  		// this.
   625  		return maximumEncryptedBlockSize
   626  	case retention == LimitsPolicy:
   627  		// TODO(dlc) - Make the blocksize relative to this if set.
   628  		return defaultLargeBlockSize
   629  	default:
   630  		// TODO(dlc) - Make the blocksize relative to this if set.
   631  		return defaultMediumBlockSize
   632  	}
   633  }
   634  
   635  func genEncryptionKey(sc StoreCipher, seed []byte) (ek cipher.AEAD, err error) {
   636  	if sc == ChaCha {
   637  		ek, err = chacha20poly1305.NewX(seed)
   638  	} else if sc == AES {
   639  		block, e := aes.NewCipher(seed)
   640  		if e != nil {
   641  			return nil, e
   642  		}
   643  		ek, err = cipher.NewGCMWithNonceSize(block, block.BlockSize())
   644  	} else {
   645  		err = errUnknownCipher
   646  	}
   647  	return ek, err
   648  }
   649  
   650  // Generate an asset encryption key from the context and server PRF.
   651  func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cipher.Stream, seed, encrypted []byte, err error) {
   652  	if fs.prf == nil {
   653  		return nil, nil, nil, nil, errNoEncryption
   654  	}
   655  	// Generate key encryption key.
   656  	rb, err := fs.prf([]byte(context))
   657  	if err != nil {
   658  		return nil, nil, nil, nil, err
   659  	}
   660  
   661  	sc := fs.fcfg.Cipher
   662  
   663  	kek, err := genEncryptionKey(sc, rb)
   664  	if err != nil {
   665  		return nil, nil, nil, nil, err
   666  	}
   667  	// Generate random asset encryption key seed.
   668  
   669  	const seedSize = 32
   670  	seed = make([]byte, seedSize)
   671  	if n, err := rand.Read(seed); err != nil {
   672  		return nil, nil, nil, nil, err
   673  	} else if n != seedSize {
   674  		return nil, nil, nil, nil, fmt.Errorf("not enough seed bytes read (%d != %d", n, seedSize)
   675  	}
   676  
   677  	aek, err = genEncryptionKey(sc, seed)
   678  	if err != nil {
   679  		return nil, nil, nil, nil, err
   680  	}
   681  
   682  	// Generate our nonce. Use same buffer to hold encrypted seed.
   683  	nonce := make([]byte, kek.NonceSize(), kek.NonceSize()+len(seed)+kek.Overhead())
   684  	if n, err := rand.Read(nonce); err != nil {
   685  		return nil, nil, nil, nil, err
   686  	} else if n != len(nonce) {
   687  		return nil, nil, nil, nil, fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce))
   688  	}
   689  
   690  	bek, err = genBlockEncryptionKey(sc, seed[:], nonce)
   691  	if err != nil {
   692  		return nil, nil, nil, nil, err
   693  	}
   694  
   695  	return aek, bek, seed, kek.Seal(nonce, nonce, seed, nil), nil
   696  }
   697  
   698  // Will generate the block encryption key.
   699  func genBlockEncryptionKey(sc StoreCipher, seed, nonce []byte) (cipher.Stream, error) {
   700  	if sc == ChaCha {
   701  		return chacha20.NewUnauthenticatedCipher(seed, nonce)
   702  	} else if sc == AES {
   703  		block, err := aes.NewCipher(seed)
   704  		if err != nil {
   705  			return nil, err
   706  		}
   707  		return cipher.NewCTR(block, nonce), nil
   708  	}
   709  	return nil, errUnknownCipher
   710  }
   711  
// recoverAEK recovers the asset encryption key for stream metadata from the
// on-disk key file. The file holds nonce||sealed(seed); the seed is unsealed
// with a key-encryption key derived from the server PRF and the stream name.
// Lock should be held.
func (fs *fileStore) recoverAEK() error {
	if fs.prf != nil && fs.aek == nil {
		ekey, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey))
		if err != nil {
			return err
		}
		// Derive the key-encryption key from the PRF keyed by the stream name.
		rb, err := fs.prf([]byte(fs.cfg.Name))
		if err != nil {
			return err
		}
		kek, err := genEncryptionKey(fs.fcfg.Cipher, rb)
		if err != nil {
			return err
		}
		// Key file layout: nonce followed by the sealed seed.
		ns := kek.NonceSize()
		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
		if err != nil {
			return err
		}
		aek, err := genEncryptionKey(fs.fcfg.Cipher, seed)
		if err != nil {
			return err
		}
		fs.aek = aek
	}
	return nil
}
   740  
// setupAEK generates and persists the asset encryption key for stream
// metadata when encryption is enabled and no key has been set yet. The
// sealed key blob is written to the JetStream meta key file.
// Lock should be held.
func (fs *fileStore) setupAEK() error {
	if fs.prf != nil && fs.aek == nil {
		key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name)
		if err != nil {
			return err
		}
		keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
		// Only a "does not exist" stat error is acceptable here.
		if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
			return err
		}
		// dios appears to act as a disk-I/O concurrency limiter (defined elsewhere).
		<-dios
		err = os.WriteFile(keyFile, encrypted, defaultFilePerms)
		dios <- struct{}{}
		if err != nil {
			return err
		}
		// Set our aek.
		fs.aek = key
	}
	return nil
}
   763  
// Write out meta and the checksum.
// writeStreamMeta persists the JSON-encoded stream config (sealed with the
// AEK when encryption is enabled) and a hex-encoded HighwayHash checksum file
// alongside it.
// Lock should be held.
func (fs *fileStore) writeStreamMeta() error {
	if err := fs.setupAEK(); err != nil {
		return err
	}

	meta := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)
	// Only a "does not exist" stat error is acceptable here.
	if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) {
		return err
	}
	b, err := json.Marshal(fs.cfg)
	if err != nil {
		return err
	}
	// Encrypt if needed.
	if fs.aek != nil {
		// Capacity sized so Seal can append in place, producing nonce||ciphertext.
		nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(b)+fs.aek.Overhead())
		if n, err := rand.Read(nonce); err != nil {
			return err
		} else if n != len(nonce) {
			return fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce))
		}
		b = fs.aek.Seal(nonce, nonce, b, nil)
	}

	<-dios
	err = os.WriteFile(meta, b, defaultFilePerms)
	dios <- struct{}{}
	if err != nil {
		return err
	}
	// Checksum is computed over the bytes actually written (possibly encrypted).
	fs.hh.Reset()
	fs.hh.Write(b)
	checksum := hex.EncodeToString(fs.hh.Sum(nil))
	sum := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileSum)
	<-dios
	err = os.WriteFile(sum, []byte(checksum), defaultFilePerms)
	dios <- struct{}{}
	if err != nil {
		return err
	}
	return nil
}
   808  
// Pools to recycle the blocks to help with memory pressure.
// Size tiers match the default block sizes used by getMsgBlockBuf /
// recycleMsgBlockBuf below (the old 16MB/8MB/2MB comments were stale).
var blkPoolBig sync.Pool    // defaultLargeBlockSize (8MB)
var blkPoolMedium sync.Pool // defaultMediumBlockSize (4MB)
var blkPoolSmall sync.Pool  // defaultSmallBlockSize (1MB)
   813  
   814  // Get a new msg block based on sz estimate.
   815  func getMsgBlockBuf(sz int) (buf []byte) {
   816  	var pb any
   817  	if sz <= defaultSmallBlockSize {
   818  		pb = blkPoolSmall.Get()
   819  	} else if sz <= defaultMediumBlockSize {
   820  		pb = blkPoolMedium.Get()
   821  	} else {
   822  		pb = blkPoolBig.Get()
   823  	}
   824  	if pb != nil {
   825  		buf = *(pb.(*[]byte))
   826  	} else {
   827  		// Here we need to make a new blk.
   828  		// If small leave as is..
   829  		if sz > defaultSmallBlockSize && sz <= defaultMediumBlockSize {
   830  			sz = defaultMediumBlockSize
   831  		} else if sz > defaultMediumBlockSize {
   832  			sz = defaultLargeBlockSize
   833  		}
   834  		buf = make([]byte, sz)
   835  	}
   836  	return buf[:0]
   837  }
   838  
   839  // Recycle the msg block.
   840  func recycleMsgBlockBuf(buf []byte) {
   841  	if buf == nil || cap(buf) < defaultSmallBlockSize {
   842  		return
   843  	}
   844  	// Make sure to reset before placing back into pool.
   845  	buf = buf[:0]
   846  
   847  	// We need to make sure the load code gets a block that can fit the maximum for a size block.
   848  	// E.g. 8, 16 etc. otherwise we thrash and actually make things worse by pulling it out, and putting
   849  	// it right back in and making a new []byte.
   850  	// From above we know its already >= defaultSmallBlockSize
   851  	if sz := cap(buf); sz < defaultMediumBlockSize {
   852  		blkPoolSmall.Put(&buf)
   853  	} else if sz < defaultLargeBlockSize {
   854  		blkPoolMedium.Put(&buf)
   855  	} else {
   856  		blkPoolBig.Put(&buf)
   857  	}
   858  }
   859  
// Per-record framing sizes for message records.
const (
	msgHdrSize     = 22 // Bytes of header preceding each record.
	checksumSize   = 8  // Trailing checksum bytes per record.
	emptyRecordLen = msgHdrSize + checksumSize
)
   865  
   866  // Lock should be held.
   867  func (fs *fileStore) noTrackSubjects() bool {
   868  	return !(fs.psim.Size() > 0 || len(fs.cfg.Subjects) > 0 || fs.cfg.Mirror != nil || len(fs.cfg.Sources) > 0)
   869  }
   870  
// Will init the basics for a message block: file name from the block index,
// cache expiration, subject tracking mode, sync policy, and the per-block
// HighwayHash used for record checksums.
func (fs *fileStore) initMsgBlock(index uint32) *msgBlock {
	mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects(), syncAlways: fs.fcfg.SyncAlways}

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, index))

	// Hash is keyed by sha256 of the per-block hash key.
	if mb.hh == nil {
		key := sha256.Sum256(fs.hashKeyForBlock(index))
		mb.hh, _ = highwayhash.New64(key[:])
	}
	return mb
}
   884  
// loadEncryptionForMsgBlock loads (or creates) the encryption keys for a
// message block. When no key file exists the keys are generated and the
// block contents — assumed plaintext, e.g. after a conversion — are
// encrypted in place.
// Lock for fs should be held.
func (fs *fileStore) loadEncryptionForMsgBlock(mb *msgBlock) error {
	if fs.prf == nil {
		return nil
	}

	var createdKeys bool
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
	if err != nil {
		// We do not seem to have keys even though we should. Could be a plaintext conversion.
		// Create the keys and we will double check below.
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			return err
		}
		createdKeys = true
	} else {
		if len(ekey) < minBlkKeySize {
			return errBadKeySize
		}
		// Recover key encryption key, derived from "<stream>:<block index>".
		rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
		if err != nil {
			return err
		}

		sc := fs.fcfg.Cipher
		kek, err := genEncryptionKey(sc, rb)
		if err != nil {
			return err
		}
		// Key file layout is nonce||sealed(seed).
		ns := kek.NonceSize()
		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
		if err != nil {
			// We may be here on a cipher conversion, so attempt to convert.
			// NOTE(review): mb.seed/mb.nonce are used below; presumably
			// convertCipher sets them on success — confirm.
			if err = mb.convertCipher(); err != nil {
				return err
			}
		} else {
			mb.seed, mb.nonce = seed, ekey[:ns]
		}
		mb.aek, err = genEncryptionKey(sc, mb.seed)
		if err != nil {
			return err
		}
		if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil {
			return err
		}
	}

	// If we created keys here, let's check the data and if it is plaintext convert here.
	if createdKeys {
		if err := mb.convertToEncrypted(); err != nil {
			return err
		}
	}

	return nil
}
   944  
   945  // Load a last checksum if needed from the block file.
   946  // Lock should be held.
   947  func (mb *msgBlock) ensureLastChecksumLoaded() {
   948  	var empty [8]byte
   949  	if mb.lchk != empty {
   950  		return
   951  	}
   952  	copy(mb.lchk[0:], mb.lastChecksum())
   953  }
   954  
// recoverMsgBlock recovers the message block with the given index. It first
// tries the legacy (< 2.10) index file as source of truth when its checksum
// matches the block file; otherwise it rebuilds the block state by scanning
// the block file, recording any detected lost data and tombstones on fs.
// Lock held on entry
func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) {
	mb := fs.initMsgBlock(index)

	// Open up the message file, but we will try to recover from the index file.
	// We will check that the last checksums match.
	file, err := mb.openBlock()
	if err != nil {
		return nil, err
	}
	defer file.Close()

	if fi, err := file.Stat(); fi != nil {
		mb.rbytes = uint64(fi.Size())
	} else {
		return nil, err
	}

	// Make sure encryption loaded if needed.
	// NOTE(review): error deliberately ignored here; a failure will surface
	// below as a checksum mismatch and force a rebuild from the block file.
	fs.loadEncryptionForMsgBlock(mb)

	// Grab last checksum from main block file.
	var lchk [8]byte
	if mb.rbytes >= checksumSize {
		if mb.bek != nil {
			// Encrypted block: decrypt the full block to get at the
			// trailing checksum bytes.
			if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize {
				mb.bek.XORKeyStream(buf, buf)
				copy(lchk[0:], buf[len(buf)-checksumSize:])
			}
		} else {
			file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
		}
	}

	// Close eagerly; the deferred Close then becomes a harmless no-op.
	file.Close()

	// Read our index file. Use this as source of truth if possible.
	// This not applicable in >= 2.10 servers. Here for upgrade paths from < 2.10.
	if err := mb.readIndexInfo(); err == nil {
		// Quick sanity check here.
		// Note this only checks that the message blk file is not newer then this file, or is empty and we expect empty.
		if (mb.rbytes == 0 && mb.msgs == 0) || bytes.Equal(lchk[:], mb.lchk[:]) {
			if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
				fs.populateGlobalPerSubjectInfo(mb)
				// Try to dump any state we needed on recovery.
				mb.tryForceExpireCacheLocked()
			}
			fs.addMsgBlock(mb)
			return mb, nil
		}
	}

	// If we get data loss rebuilding the message block state record that with the fs itself.
	ld, tombs, _ := mb.rebuildState()
	if ld != nil {
		fs.addLostData(ld)
	}
	// Collect all tombstones.
	if len(tombs) > 0 {
		fs.tombs = append(fs.tombs, tombs...)
	}

	if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
		fs.populateGlobalPerSubjectInfo(mb)
		// Try to dump any state we needed on recovery.
		mb.tryForceExpireCacheLocked()
	}

	mb.closeFDs()
	fs.addMsgBlock(mb)

	return mb, nil
}
  1028  
  1029  func (fs *fileStore) lostData() *LostStreamData {
  1030  	fs.mu.RLock()
  1031  	defer fs.mu.RUnlock()
  1032  	if fs.ld == nil {
  1033  		return nil
  1034  	}
  1035  	nld := *fs.ld
  1036  	return &nld
  1037  }
  1038  
  1039  // Lock should be held.
  1040  func (fs *fileStore) addLostData(ld *LostStreamData) {
  1041  	if ld == nil {
  1042  		return
  1043  	}
  1044  	if fs.ld != nil {
  1045  		var added bool
  1046  		for _, seq := range ld.Msgs {
  1047  			if _, found := fs.ld.exists(seq); !found {
  1048  				fs.ld.Msgs = append(fs.ld.Msgs, seq)
  1049  				added = true
  1050  			}
  1051  		}
  1052  		if added {
  1053  			msgs := fs.ld.Msgs
  1054  			sort.Slice(msgs, func(i, j int) bool { return msgs[i] < msgs[j] })
  1055  			fs.ld.Bytes += ld.Bytes
  1056  		}
  1057  	} else {
  1058  		fs.ld = ld
  1059  	}
  1060  }
  1061  
  1062  // Helper to see if we already have this sequence reported in our lost data.
  1063  func (ld *LostStreamData) exists(seq uint64) (int, bool) {
  1064  	i, found := sort.Find(len(ld.Msgs), func(i int) int {
  1065  		tseq := ld.Msgs[i]
  1066  		if tseq < seq {
  1067  			return -1
  1068  		}
  1069  		if tseq > seq {
  1070  			return +1
  1071  		}
  1072  		return 0
  1073  	})
  1074  	return i, found
  1075  }
  1076  
  1077  func (fs *fileStore) removeFromLostData(seq uint64) {
  1078  	if fs.ld == nil {
  1079  		return
  1080  	}
  1081  	if i, found := fs.ld.exists(seq); found {
  1082  		fs.ld.Msgs = append(fs.ld.Msgs[:i], fs.ld.Msgs[i+1:]...)
  1083  		if len(fs.ld.Msgs) == 0 {
  1084  			fs.ld = nil
  1085  		}
  1086  	}
  1087  }
  1088  
// rebuildState takes the filestore lock and rebuilds the top-level stream
// state from the per-block state, folding ld into our lost data record.
func (fs *fileStore) rebuildState(ld *LostStreamData) {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	fs.rebuildStateLocked(ld)
}
  1094  
  1095  // Lock should be held.
  1096  func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) {
  1097  	fs.addLostData(ld)
  1098  
  1099  	fs.state.Msgs, fs.state.Bytes = 0, 0
  1100  	fs.state.FirstSeq, fs.state.LastSeq = 0, 0
  1101  
  1102  	for _, mb := range fs.blks {
  1103  		mb.mu.RLock()
  1104  		fs.state.Msgs += mb.msgs
  1105  		fs.state.Bytes += mb.bytes
  1106  		fseq := atomic.LoadUint64(&mb.first.seq)
  1107  		if fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq {
  1108  			fs.state.FirstSeq = fseq
  1109  			fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
  1110  		}
  1111  		fs.state.LastSeq = atomic.LoadUint64(&mb.last.seq)
  1112  		fs.state.LastTime = time.Unix(0, mb.last.ts).UTC()
  1113  		mb.mu.RUnlock()
  1114  	}
  1115  }
  1116  
// convertCipher attempts to convert this message block from its old cipher
// (the opposite of the currently configured one) to the configured cipher.
// It tries every available PRF with both ciphers to recover the old keys,
// decrypts and sanity-parses the block, generates fresh keys, and rewrites
// the block file encrypted with the new cipher. On key generation failure
// the old key file is restored.
func (mb *msgBlock) convertCipher() error {
	fs := mb.fs
	sc := fs.fcfg.Cipher

	// Determine the "other" cipher we may be converting from.
	// NOTE(review): NoCipher is not covered here, osc stays the zero value
	// in that case — confirm that path cannot reach this function.
	var osc StoreCipher
	switch sc {
	case ChaCha:
		osc = AES
	case AES:
		osc = ChaCha
	}

	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
	if err != nil {
		return err
	}
	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}
	// Pair each available PRF with both ciphers so we can try all
	// combinations when recovering the key encryption key.
	type prfWithCipher struct {
		keyGen
		StoreCipher
	}
	var prfs []prfWithCipher
	if fs.prf != nil {
		prfs = append(prfs, prfWithCipher{fs.prf, sc})
		prfs = append(prfs, prfWithCipher{fs.prf, osc})
	}
	if fs.oldprf != nil {
		prfs = append(prfs, prfWithCipher{fs.oldprf, sc})
		prfs = append(prfs, prfWithCipher{fs.oldprf, osc})
	}

	for _, prf := range prfs {
		// Recover key encryption key.
		rb, err := prf.keyGen([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
		if err != nil {
			continue
		}
		kek, err := genEncryptionKey(prf.StoreCipher, rb)
		if err != nil {
			continue
		}
		// Key file layout is nonce followed by the sealed seed. A failed
		// Open simply means this prf/cipher combination is not the one.
		ns := kek.NonceSize()
		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
		if err != nil {
			continue
		}
		nonce := ekey[:ns]
		bek, err := genBlockEncryptionKey(prf.StoreCipher, seed, nonce)
		if err != nil {
			return err
		}

		// Decrypt in place with the recovered old key.
		buf, _ := mb.loadBlock(nil)
		bek.XORKeyStream(buf, buf)
		// Make sure we can parse with old cipher and key file.
		if err = mb.indexCacheBuf(buf); err != nil {
			return err
		}
		// Reset the cache since we just read everything in.
		mb.cache = nil

		// Generate new keys. If we error for some reason then we will put
		// the old keyfile back.
		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
			keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))
			<-dios
			os.WriteFile(keyFile, ekey, defaultFilePerms)
			dios <- struct{}{}
			return err
		}
		// Re-encrypt with the new block key and persist.
		mb.bek.XORKeyStream(buf, buf)
		<-dios
		err = os.WriteFile(mb.mfn, buf, defaultFilePerms)
		dios <- struct{}{}
		if err != nil {
			return err
		}
		return nil
	}
	return fmt.Errorf("unable to recover keys")
}
  1202  
  1203  // Convert a plaintext block to encrypted.
  1204  func (mb *msgBlock) convertToEncrypted() error {
  1205  	if mb.bek == nil {
  1206  		return nil
  1207  	}
  1208  	buf, err := mb.loadBlock(nil)
  1209  	if err != nil {
  1210  		return err
  1211  	}
  1212  	if err := mb.indexCacheBuf(buf); err != nil {
  1213  		// This likely indicates this was already encrypted or corrupt.
  1214  		mb.cache = nil
  1215  		return err
  1216  	}
  1217  	// Undo cache from above for later.
  1218  	mb.cache = nil
  1219  	mb.bek.XORKeyStream(buf, buf)
  1220  	<-dios
  1221  	err = os.WriteFile(mb.mfn, buf, defaultFilePerms)
  1222  	dios <- struct{}{}
  1223  	if err != nil {
  1224  		return err
  1225  	}
  1226  	return nil
  1227  }
  1228  
// Rebuild the state of the blk based on what we have on disk in the N.blk file.
// We will return any lost data, and we will return any delete tombstones we encountered.
// This is the locking wrapper around rebuildStateLocked.
func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.rebuildStateLocked()
}
  1236  
// Rebuild the state of the blk based on what we have on disk in the N.blk file.
// Scans every record, validating headers and checksums. On a bad or short
// record the file is truncated at that point and the remaining sequences
// are reported as lost data. Also collects delete tombstones encountered.
// Lock should be held.
func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) {
	// Remember where last seq was before the rebuild so we can report
	// anything past what we manage to re-read as lost.
	startLastSeq := atomic.LoadUint64(&mb.last.seq)

	// Remove the .fss file and clear any cache we have set.
	mb.clearCacheAndOffset()

	buf, err := mb.loadBlock(nil)
	defer recycleMsgBlockBuf(buf)

	if err != nil || len(buf) == 0 {
		var ld *LostStreamData
		// No data to rebuild from here.
		if mb.msgs > 0 {
			// We need to declare lost data here.
			ld = &LostStreamData{Msgs: make([]uint64, 0, mb.msgs), Bytes: mb.bytes}
			firstSeq, lastSeq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
			for seq := firstSeq; seq <= lastSeq; seq++ {
				// Sequences already deleted are not lost.
				if !mb.dmap.Exists(seq) {
					ld.Msgs = append(ld.Msgs, seq)
				}
			}
			// Clear invalid state. We will let this blk be added in here.
			mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
			mb.dmap.Empty()
			atomic.StoreUint64(&mb.first.seq, atomic.LoadUint64(&mb.last.seq)+1)
		}
		return ld, nil, err
	}

	// Clear state we need to rebuild.
	mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
	atomic.StoreUint64(&mb.last.seq, 0)
	mb.last.ts = 0
	firstNeedsSet := true

	// Check if we need to decrypt.
	if mb.bek != nil && len(buf) > 0 {
		// Recreate to reset counter.
		mb.bek, err = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return nil, nil, err
		}
		mb.bek.XORKeyStream(buf, buf)
	}

	// Check for compression.
	if buf, err = mb.decompressIfNeeded(buf); err != nil {
		return nil, nil, err
	}

	mb.rbytes = uint64(len(buf))

	// Helper to record a deleted sequence, ignoring zero.
	addToDmap := func(seq uint64) {
		if seq == 0 {
			return
		}
		mb.dmap.Insert(seq)
	}

	var le = binary.LittleEndian

	// Helper to truncate the block file at the given byte offset when a
	// bad record is found, refreshing our last checksum from the new tail.
	truncate := func(index uint32) {
		var fd *os.File
		if mb.mfd != nil {
			fd = mb.mfd
		} else {
			<-dios
			fd, err = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
			dios <- struct{}{}
			if err == nil {
				defer fd.Close()
			}
		}
		if fd == nil {
			return
		}
		if err := fd.Truncate(int64(index)); err == nil {
			// Update our checksum.
			if index >= 8 {
				var lchk [8]byte
				fd.ReadAt(lchk[:], int64(index-8))
				copy(mb.lchk[0:], lchk[:])
			}
			fd.Sync()
		}
	}

	// Helper that reports every sequence past what we re-read (up to the
	// original last seq) as lost, with lb lost bytes.
	gatherLost := func(lb uint32) *LostStreamData {
		var ld LostStreamData
		for seq := atomic.LoadUint64(&mb.last.seq) + 1; seq <= startLastSeq; seq++ {
			ld.Msgs = append(ld.Msgs, seq)
		}
		ld.Bytes = uint64(lb)
		return &ld
	}

	// For tombstones that we find and collect.
	var (
		tombstones      []uint64
		minTombstoneSeq uint64
		minTombstoneTs  int64
	)

	// Walk all records in the block.
	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
		// Short header means a partial write; truncate here.
		if index+msgHdrSize > lbuf {
			truncate(index)
			return gatherLost(lbuf - index), tombstones, nil
		}

		hdr := buf[index : index+msgHdrSize]
		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])

		hasHeaders := rl&hbit != 0
		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize
		// Do some quick sanity checks here.
		if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
			truncate(index)
			return gatherLost(lbuf - index), tombstones, errBadMsg
		}

		// Check for checksum failures before additional processing.
		data := buf[index+msgHdrSize : index+rl]
		if hh := mb.hh; hh != nil {
			hh.Reset()
			hh.Write(hdr[4:20])
			hh.Write(data[:slen])
			if hasHeaders {
				hh.Write(data[slen+4 : dlen-recordHashSize])
			} else {
				hh.Write(data[slen : dlen-recordHashSize])
			}
			checksum := hh.Sum(nil)
			if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) {
				truncate(index)
				return gatherLost(lbuf - index), tombstones, errBadMsg
			}
			copy(mb.lchk[0:], checksum)
		}

		// Grab our sequence and timestamp.
		seq := le.Uint64(hdr[4:])
		ts := int64(le.Uint64(hdr[12:]))

		// Check if this is a delete tombstone.
		if seq&tbit != 0 {
			seq = seq &^ tbit
			// Need to process this here and make sure we have accounted for this properly.
			tombstones = append(tombstones, seq)
			if minTombstoneSeq == 0 || seq < minTombstoneSeq {
				minTombstoneSeq, minTombstoneTs = seq, ts
			}
			index += rl
			continue
		}

		fseq := atomic.LoadUint64(&mb.first.seq)
		// This is an old erased message, or a new one that we can track.
		if seq == 0 || seq&ebit != 0 || seq < fseq {
			seq = seq &^ ebit
			if seq >= fseq {
				// Only add to dmap if past recorded first seq and non-zero.
				if seq != 0 {
					addToDmap(seq)
				}
				atomic.StoreUint64(&mb.last.seq, seq)
				mb.last.ts = ts
				if mb.msgs == 0 {
					atomic.StoreUint64(&mb.first.seq, seq+1)
					mb.first.ts = 0
				}
			}
			index += rl
			continue
		}

		// This is for when we have index info that adjusts for deleted messages
		// at the head. So the first.seq will be already set here. If this is larger
		// replace what we have with this seq.
		if firstNeedsSet && seq >= fseq {
			atomic.StoreUint64(&mb.first.seq, seq)
			firstNeedsSet, mb.first.ts = false, ts
		}

		if !mb.dmap.Exists(seq) {
			mb.msgs++
			mb.bytes += uint64(rl)
		}

		// Always set last
		atomic.StoreUint64(&mb.last.seq, seq)
		mb.last.ts = ts

		// Advance to next record.
		index += rl
	}

	// For empty msg blocks make sure we recover last seq correctly based off of first.
	// Or if we seem to have no messages but had a tombstone, which we use to remember
	// sequences and timestamps now, use that to properly setup the first and last.
	if mb.msgs == 0 {
		fseq := atomic.LoadUint64(&mb.first.seq)
		if fseq > 0 {
			atomic.StoreUint64(&mb.last.seq, fseq-1)
		} else if fseq == 0 && minTombstoneSeq > 0 {
			atomic.StoreUint64(&mb.first.seq, minTombstoneSeq+1)
			mb.first.ts = 0
			if mb.last.seq == 0 {
				atomic.StoreUint64(&mb.last.seq, minTombstoneSeq)
				mb.last.ts = minTombstoneTs
			}
		}
	}

	return nil, tombstones, nil
}
  1456  
  1457  // For doing warn logging.
  1458  // Lock should be held.
  1459  func (fs *fileStore) warn(format string, args ...any) {
  1460  	// No-op if no server configured.
  1461  	if fs.srv == nil {
  1462  		return
  1463  	}
  1464  	fs.srv.Warnf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...)
  1465  }
  1466  
  1467  // For doing debug logging.
  1468  // Lock should be held.
  1469  func (fs *fileStore) debug(format string, args ...any) {
  1470  	// No-op if no server configured.
  1471  	if fs.srv == nil {
  1472  		return
  1473  	}
  1474  	fs.srv.Debugf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...)
  1475  }
  1476  
  1477  // Track local state but ignore timestamps here.
  1478  func updateTrackingState(state *StreamState, mb *msgBlock) {
  1479  	if state.FirstSeq == 0 {
  1480  		state.FirstSeq = mb.first.seq
  1481  	} else if mb.first.seq < state.FirstSeq {
  1482  		state.FirstSeq = mb.first.seq
  1483  	}
  1484  	if mb.last.seq > state.LastSeq {
  1485  		state.LastSeq = mb.last.seq
  1486  	}
  1487  	state.Msgs += mb.msgs
  1488  	state.Bytes += mb.bytes
  1489  }
  1490  
  1491  // Determine if our tracking states are the same.
  1492  func trackingStatesEqual(fs, mb *StreamState) bool {
  1493  	// When a fs is brand new the fs state will have first seq of 0, but tracking mb may have 1.
  1494  	// If either has a first sequence that is not 0 or 1 we will check if they are the same, otherwise skip.
  1495  	if (fs.FirstSeq > 1 && mb.FirstSeq > 1) || mb.FirstSeq > 1 {
  1496  		return fs.Msgs == mb.Msgs && fs.FirstSeq == mb.FirstSeq && fs.LastSeq == mb.LastSeq && fs.Bytes == mb.Bytes
  1497  	}
  1498  	return fs.Msgs == mb.Msgs && fs.LastSeq == mb.LastSeq && fs.Bytes == mb.Bytes
  1499  }
  1500  
// recoverFullState will attempt to recover our last full state and re-process any state changes
// that happened afterwards. It validates the stream state file's checksum,
// decrypts it if needed, decodes the top-level state, per-subject info and
// per-block summaries, then re-checks the last block (and any newer blocks)
// against the actual block files on disk.
func (fs *fileStore) recoverFullState() (rerr error) {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Check for any left over purged messages.
	<-dios
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}
	// Grab our stream state file and load it in.
	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	buf, err := os.ReadFile(fn)
	dios <- struct{}{}

	if err != nil {
		if !os.IsNotExist(err) {
			fs.warn("Could not read stream state file: %v", err)
		}
		return err
	}

	const minLen = 32
	if len(buf) < minLen {
		os.Remove(fn)
		fs.warn("Stream state too short (%d bytes)", len(buf))
		return errCorruptState
	}

	// The highwayhash will be on the end. Check that it still matches.
	h := buf[len(buf)-highwayhash.Size64:]
	buf = buf[:len(buf)-highwayhash.Size64]
	fs.hh.Reset()
	fs.hh.Write(buf)
	if !bytes.Equal(h, fs.hh.Sum(nil)) {
		os.Remove(fn)
		fs.warn("Stream state checksum did not match")
		return errCorruptState
	}

	// Decrypt if needed.
	if fs.prf != nil {
		// We can be setup for encryption but if this is a snapshot restore we will be missing the keyfile
		// since snapshots strip encryption.
		if err := fs.recoverAEK(); err == nil {
			ns := fs.aek.NonceSize()
			buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:], nil)
			if err != nil {
				fs.warn("Stream state error reading encryption key: %v", err)
				return err
			}
		}
	}

	if buf[0] != fullStateMagic || buf[1] != fullStateVersion {
		os.Remove(fn)
		fs.warn("Stream state magic and version mismatch")
		return errCorruptState
	}

	// bi is the read cursor into buf; a negative value flags a decode error.
	bi := hdrLen

	readU64 := func() uint64 {
		if bi < 0 {
			return 0
		}
		v, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		return v
	}
	readI64 := func() int64 {
		if bi < 0 {
			return 0
		}
		v, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return v
	}

	setTime := func(t *time.Time, ts int64) {
		if ts == 0 {
			*t = time.Time{}
		} else {
			*t = time.Unix(0, ts).UTC()
		}
	}

	// Decode top-level stream state. Block timestamps below are stored as
	// deltas relative to baseTime.
	var state StreamState
	state.Msgs = readU64()
	state.Bytes = readU64()
	state.FirstSeq = readU64()
	baseTime := readI64()
	setTime(&state.FirstTime, baseTime)
	state.LastSeq = readU64()
	setTime(&state.LastTime, readI64())

	// Check for per subject info.
	if numSubjects := int(readU64()); numSubjects > 0 {
		fs.psim, fs.tsl = fs.psim.Empty(), 0
		for i := 0; i < numSubjects; i++ {
			if lsubj := int(readU64()); lsubj > 0 {
				if bi+lsubj > len(buf) {
					os.Remove(fn)
					fs.warn("Stream state bad subject len (%d)", lsubj)
					return errCorruptState
				}
				// If we have lots of subjects this will alloc for each one.
				// We could reference the underlying buffer, but we could guess wrong if
				// number of blocks is large and subjects is low, since we would reference buf.
				subj := buf[bi : bi+lsubj]
				// We had a bug that could cause memory corruption in the PSIM that could have gotten stored to disk.
				// Only would affect subjects, so do quick check.
				if !isValidSubject(string(subj), true) {
					os.Remove(fn)
					fs.warn("Stream state corrupt subject detected")
					return errCorruptState
				}
				bi += lsubj
				psi := psi{total: readU64(), fblk: uint32(readU64())}
				if psi.total > 1 {
					psi.lblk = uint32(readU64())
				} else {
					psi.lblk = psi.fblk
				}
				fs.psim.Insert(subj, psi)
				fs.tsl += lsubj
			}
		}
	}

	// Track the state as represented by the blocks themselves.
	var mstate StreamState

	if numBlocks := readU64(); numBlocks > 0 {
		lastIndex := int(numBlocks - 1)
		fs.blks = make([]*msgBlock, 0, numBlocks)
		for i := 0; i < int(numBlocks); i++ {
			index, nbytes, fseq, fts, lseq, lts, numDeleted := uint32(readU64()), readU64(), readU64(), readI64(), readU64(), readI64(), readU64()
			if bi < 0 {
				break
			}
			mb := fs.initMsgBlock(index)
			atomic.StoreUint64(&mb.first.seq, fseq)
			atomic.StoreUint64(&mb.last.seq, lseq)
			mb.msgs, mb.bytes = lseq-fseq+1, nbytes
			mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime
			if numDeleted > 0 {
				// Deleted sequences are stored as an encoded avl tree.
				dmap, n, err := avl.Decode(buf[bi:])
				if err != nil {
					os.Remove(fn)
					fs.warn("Stream state error decoding avl dmap: %v", err)
					return errCorruptState
				}
				mb.dmap = *dmap
				if mb.msgs > numDeleted {
					mb.msgs -= numDeleted
				} else {
					mb.msgs = 0
				}
				bi += n
			}
			// Only add in if not empty or the lmb.
			if mb.msgs > 0 || i == lastIndex {
				fs.addMsgBlock(mb)
				updateTrackingState(&mstate, mb)
			} else {
				// Mark dirty to cleanup.
				fs.dirty++
			}
		}
	}

	// Pull in last block index for the block that had last checksum when we wrote the full state.
	blkIndex := uint32(readU64())
	var lchk [8]byte
	if bi+len(lchk) > len(buf) {
		bi = -1
	} else {
		copy(lchk[0:], buf[bi:bi+len(lchk)])
	}

	// Check if we had any errors.
	if bi < 0 {
		os.Remove(fn)
		fs.warn("Stream state has no checksum present")
		return errCorruptState
	}

	// Move into place our state, msgBlks and subject info.
	fs.state = state

	// First let's check the happy path, open the blk file that was the lmb when we created the full state.
	// See if we have the last block available.
	var matched bool
	mb := fs.lmb
	if mb == nil || mb.index != blkIndex {
		fs.warn("Stream state block does not exist or index mismatch")
		return errCorruptState
	}
	if _, err := os.Stat(mb.mfn); err != nil && os.IsNotExist(err) {
		// If our saved state is past what we see on disk, fallback and rebuild.
		if ld, _, _ := mb.rebuildState(); ld != nil {
			fs.addLostData(ld)
		}
		fs.warn("Stream state detected prior state, could not locate msg block %d", blkIndex)
		return errPriorState
	}
	if matched = bytes.Equal(mb.lastChecksum(), lchk[:]); !matched {
		// The block changed after the state snapshot was written.
		// Remove the last message block since recover will add in the new one.
		fs.removeMsgBlockFromList(mb)
		// Reverse update of tracking state for this mb, will add new state in below.
		mstate.Msgs -= mb.msgs
		mstate.Bytes -= mb.bytes
		if nmb, err := fs.recoverMsgBlock(mb.index); err != nil && !os.IsNotExist(err) {
			fs.warn("Stream state could not recover last msg block")
			os.Remove(fn)
			return errCorruptState
		} else if nmb != nil {
			fs.adjustAccounting(mb, nmb)
			updateTrackingState(&mstate, nmb)
		}
	}

	// On success double check our state.
	checkState := func() error {
		// We check first and last seq and number of msgs and bytes. If there is a difference,
		// return and error so we rebuild from the message block state on disk.
		if !trackingStatesEqual(&fs.state, &mstate) {
			fs.warn("Stream state encountered internal inconsistency on recover")
			os.Remove(fn)
			return errCorruptState
		}
		return nil
	}

	// We may need to check other blocks. Even if we matched last checksum we will see if there is another block.
	// Loop terminates when the next block file does not exist.
	for bi := blkIndex + 1; ; bi++ {
		nmb, err := fs.recoverMsgBlock(bi)
		if err != nil {
			if os.IsNotExist(err) {
				return checkState()
			}
			os.Remove(fn)
			fs.warn("Stream state could not recover msg block %d", bi)
			return err
		}
		if nmb != nil {
			// Update top level accounting
			if fseq := atomic.LoadUint64(&nmb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq {
				fs.state.FirstSeq = fseq
				fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC()
			}
			if lseq := atomic.LoadUint64(&nmb.last.seq); lseq > fs.state.LastSeq {
				fs.state.LastSeq = lseq
				fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC()
			}
			fs.state.Msgs += nmb.msgs
			fs.state.Bytes += nmb.bytes
			updateTrackingState(&mstate, nmb)
		}
	}
}
  1773  
// adjustAccounting will be called when a stream state was only partially accounted for
// within a message block, e.g. additional records were added after the stream state.
// mb holds the stale accounting from the state snapshot, nmb the freshly
// recovered block. Subject counts for the already-accounted range are
// undone, then fs totals are updated for the new records only.
// Lock should be held.
func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) {
	nmb.mu.Lock()
	defer nmb.mu.Unlock()

	// First make sure the new block is loaded.
	if nmb.cacheNotLoaded() {
		nmb.loadMsgsWithLock()
	}
	nmb.ensurePerSubjectInfoLoaded()

	var smv StoreMsg

	// Need to walk previous messages and undo psim stats.
	// We already undid msgs and bytes accounting.
	for seq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ {
		// Lookup the message. If an error will be deleted, so can skip.
		sm, err := nmb.cacheLookup(seq, &smv)
		if err != nil {
			continue
		}
		if len(sm.subj) > 0 && fs.psim != nil {
			if info, ok := fs.psim.Find(stringToBytes(sm.subj)); ok {
				info.total--
			}
		}
	}

	// Walk only new messages and update accounting at fs level. Any messages that should have
	// triggered limits exceeded will be handled after the recovery and prior to the stream
	// being available to the system.
	for seq, lseq := atomic.LoadUint64(&mb.last.seq)+1, atomic.LoadUint64(&nmb.last.seq); seq <= lseq; seq++ {
		// Lookup the message. If an error will be deleted, so can skip.
		sm, err := nmb.cacheLookup(seq, &smv)
		if err != nil {
			continue
		}
		// Since we found it we just need to adjust fs totals and psim.
		fs.state.Msgs++
		fs.state.Bytes += fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
	}

	// Now check to see if we had a higher first for the recovered state mb vs nmb.
	if atomic.LoadUint64(&nmb.first.seq) < atomic.LoadUint64(&mb.first.seq) {
		// Now set first for nmb.
		atomic.StoreUint64(&nmb.first.seq, atomic.LoadUint64(&mb.first.seq))
	}

	// Update top level accounting.
	if fseq := atomic.LoadUint64(&nmb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq {
		fs.state.FirstSeq = fseq
		fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC()
	}
	if lseq := atomic.LoadUint64(&nmb.last.seq); lseq > fs.state.LastSeq {
		fs.state.LastSeq = lseq
		fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC()
	}
}
  1834  
// Grabs last checksum for the named block file.
// Takes into account encryption etc. Returns nil when the block cannot be
// opened or is shorter than a checksum. As a side effect it refreshes
// mb.rbytes, and may load encryption keys and reset mb.bek.
func (mb *msgBlock) lastChecksum() []byte {
	f, err := mb.openBlock()
	if err != nil {
		return nil
	}
	defer f.Close()

	var lchk [8]byte
	if fi, _ := f.Stat(); fi != nil {
		mb.rbytes = uint64(fi.Size())
	}
	if mb.rbytes < checksumSize {
		return nil
	}
	// Encrypted?
	// Check for encryption, we do not load keys on startup anymore so might need to load them here.
	if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) {
		if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
			return nil
		}
	}
	if mb.bek != nil {
		// Encrypted block: decrypt the whole block to reach the trailing
		// checksum. Recreate bek to reset the stream cipher counter.
		if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize {
			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				return nil
			}
			mb.bek = bek
			mb.bek.XORKeyStream(buf, buf)
			copy(lchk[0:], buf[len(buf)-checksumSize:])
		}
	} else {
		// Plaintext block: read the trailing bytes directly.
		f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
	}
	return lchk[:]
}
  1873  
  1874  // This will make sure we clean up old idx and fss files.
  1875  func (fs *fileStore) cleanupOldMeta() {
  1876  	fs.mu.RLock()
  1877  	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
  1878  	fs.mu.RUnlock()
  1879  
  1880  	<-dios
  1881  	f, err := os.Open(mdir)
  1882  	dios <- struct{}{}
  1883  	if err != nil {
  1884  		return
  1885  	}
  1886  
  1887  	dirs, _ := f.ReadDir(-1)
  1888  	f.Close()
  1889  
  1890  	const (
  1891  		minLen    = 4
  1892  		idxSuffix = ".idx"
  1893  		fssSuffix = ".fss"
  1894  	)
  1895  	for _, fi := range dirs {
  1896  		if name := fi.Name(); strings.HasSuffix(name, idxSuffix) || strings.HasSuffix(name, fssSuffix) {
  1897  			os.Remove(filepath.Join(mdir, name))
  1898  		}
  1899  	}
  1900  }
  1901  
// recoverMsgs rebuilds the in-memory state (block list, first/last sequence
// and time, message and byte totals) from the message block files on disk.
// Returns errNotReadable if the message directory cannot be read, or the
// first error encountered while recovering an individual block.
func (fs *fileStore) recoverMsgs() error {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Check for any left over purged messages.
	<-dios
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	f, err := os.Open(mdir)
	if err != nil {
		dios <- struct{}{}
		return errNotReadable
	}
	dirs, err := f.ReadDir(-1)
	f.Close()
	dios <- struct{}{}

	if err != nil {
		return errNotReadable
	}

	// Collect the numeric index of every block file present so we can
	// recover them strictly in order.
	indices := make(sort.IntSlice, 0, len(dirs))
	var index int
	for _, fi := range dirs {
		if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 {
			indices = append(indices, index)
		}
	}
	indices.Sort()

	// Recover all of the msg blocks.
	// We now guarantee they are coming in order.
	for _, index := range indices {
		if mb, err := fs.recoverMsgBlock(uint32(index)); err == nil && mb != nil {
			// This is a truncate block with possibly no index. If the OS got shutdown
			// out from underneath of us this is possible.
			if mb.first.seq == 0 {
				mb.dirtyCloseWithRemove(true)
				fs.removeMsgBlockFromList(mb)
				continue
			}
			// Track the lowest first sequence/time across recovered blocks.
			if fseq := atomic.LoadUint64(&mb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq {
				fs.state.FirstSeq = fseq
				if mb.first.ts == 0 {
					fs.state.FirstTime = time.Time{}
				} else {
					fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
				}
			}
			// Track the highest last sequence/time across recovered blocks.
			if lseq := atomic.LoadUint64(&mb.last.seq); lseq > fs.state.LastSeq {
				fs.state.LastSeq = lseq
				if mb.last.ts == 0 {
					fs.state.LastTime = time.Time{}
				} else {
					fs.state.LastTime = time.Unix(0, mb.last.ts).UTC()
				}
			}
			fs.state.Msgs += mb.msgs
			fs.state.Bytes += mb.bytes
		} else {
			return err
		}
	}

	// The last recovered block becomes the write block, otherwise create a fresh one.
	if len(fs.blks) > 0 {
		fs.lmb = fs.blks[len(fs.blks)-1]
	} else {
		_, err = fs.newMsgBlockForWrite()
	}

	// Check if we encountered any lost data.
	// If so, drop blocks that ended up completely empty.
	if fs.ld != nil {
		var emptyBlks []*msgBlock
		for _, mb := range fs.blks {
			if mb.msgs == 0 && mb.rbytes == 0 {
				emptyBlks = append(emptyBlks, mb)
			}
		}
		for _, mb := range emptyBlks {
			// Need the mb lock here.
			mb.mu.Lock()
			fs.removeMsgBlock(mb)
			mb.mu.Unlock()
		}
	}

	if err != nil {
		return err
	}

	// Check for keyfiles orphans.
	// Remove any encryption key files that no longer have a matching block.
	if kms, err := filepath.Glob(filepath.Join(mdir, keyScanAll)); err == nil && len(kms) > 0 {
		valid := make(map[uint32]bool)
		for _, mb := range fs.blks {
			valid[mb.index] = true
		}
		for _, fn := range kms {
			var index uint32
			shouldRemove := true
			if n, err := fmt.Sscanf(filepath.Base(fn), keyScan, &index); err == nil && n == 1 && valid[index] {
				shouldRemove = false
			}
			if shouldRemove {
				os.Remove(fn)
			}
		}
	}

	return nil
}
  2015  
// Will expire msgs that have aged out on restart.
// We will treat this differently in case we have a recovery
// that will expire a lot of messages on startup.
// Should only be called on startup.
func (fs *fileStore) expireMsgsOnRecover() {
	if fs.state.Msgs == 0 {
		return
	}

	// Any message with a timestamp at or before minAge has exceeded MaxAge.
	var minAge = time.Now().UnixNano() - int64(fs.cfg.MaxAge)
	// Totals removed, and how many leading blocks were removed entirely.
	var purged, bytes uint64
	var deleted int
	// Timestamp of the first surviving message, used to reset the age timer.
	var nts int64

	// If we expire all make sure to write out a tombstone. Need to be done by hand here,
	// usually taken care of by fs.removeMsgBlock() but we do not call that here.
	var last msgId

	// Removes a block whose messages have all aged out.
	// Caller holds the block lock.
	deleteEmptyBlock := func(mb *msgBlock) {
		// If we are the last keep state to remember first/last sequence.
		// Do this part by hand since not deleting one by one.
		if mb == fs.lmb {
			last.seq = atomic.LoadUint64(&mb.last.seq)
			last.ts = mb.last.ts
		}
		// Make sure we do subject cleanup as well.
		mb.ensurePerSubjectInfoLoaded()
		for subj, ss := range mb.fss {
			for i := uint64(0); i < ss.Msgs; i++ {
				fs.removePerSubject(subj)
			}
		}
		mb.dirtyCloseWithRemove(true)
		deleted++
	}

	// Blocks are ordered, so walk from the oldest and stop at the first
	// block containing messages new enough to keep.
	for _, mb := range fs.blks {
		mb.mu.Lock()
		if minAge < mb.first.ts {
			// First message in this block is still fresh, we are done.
			nts = mb.first.ts
			mb.mu.Unlock()
			break
		}
		// Can we remove whole block here?
		if mb.last.ts <= minAge {
			purged += mb.msgs
			bytes += mb.bytes
			deleteEmptyBlock(mb)
			mb.mu.Unlock()
			continue
		}

		// If we are here we have to process the interior messages of this blk.
		// This will load fss as well.
		if err := mb.loadMsgsWithLock(); err != nil {
			mb.mu.Unlock()
			break
		}

		var smv StoreMsg
		var needNextFirst bool

		// Walk messages and remove if expired.
		fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
		for seq := fseq; seq <= lseq; seq++ {
			sm, err := mb.cacheLookup(seq, &smv)
			// Process interior deleted msgs.
			if err == errDeletedMsg {
				// Update dmap.
				if mb.dmap.Exists(seq) {
					mb.dmap.Delete(seq)
				}
				// Keep this updated just in case since we are removing dmap entries.
				atomic.StoreUint64(&mb.first.seq, seq)
				needNextFirst = true
				continue
			}
			// Break on other errors.
			if err != nil || sm == nil {
				atomic.StoreUint64(&mb.first.seq, seq)
				needNextFirst = true
				break
			}

			// No error and sm != nil from here onward.

			// Check for done.
			// This message is newer than minAge so it, and everything after it, survives.
			if minAge < sm.ts {
				atomic.StoreUint64(&mb.first.seq, sm.seq)
				mb.first.ts = sm.ts
				needNextFirst = false
				nts = sm.ts
				break
			}

			// Delete the message here.
			if mb.msgs > 0 {
				atomic.StoreUint64(&mb.first.seq, seq)
				needNextFirst = true
				sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
				// Guard against accounting underflow.
				if sz > mb.bytes {
					sz = mb.bytes
				}
				mb.bytes -= sz
				bytes += sz
				mb.msgs--
				purged++
			}
			// Update fss
			// Make sure we have fss loaded.
			mb.removeSeqPerSubject(sm.subj, seq)
			fs.removePerSubject(sm.subj)
		}
		// Make sure we have a proper next first sequence.
		if needNextFirst {
			mb.selectNextFirst()
		}
		// Check if empty after processing, could happen if tail of messages are all deleted.
		if mb.msgs == 0 {
			deleteEmptyBlock(mb)
		}
		mb.mu.Unlock()
		break
	}

	if nts > 0 {
		// Make sure to set age check based on this value.
		fs.resetAgeChk(nts - minAge)
	}

	if deleted > 0 {
		// Update block map.
		if fs.bim != nil {
			for _, mb := range fs.blks[:deleted] {
				delete(fs.bim, mb.index)
			}
		}
		// Update blks slice.
		fs.blks = copyMsgBlocks(fs.blks[deleted:])
		if lb := len(fs.blks); lb == 0 {
			fs.lmb = nil
		} else {
			fs.lmb = fs.blks[lb-1]
		}
	}
	// Update top level accounting, guarding against underflow.
	if purged < fs.state.Msgs {
		fs.state.Msgs -= purged
	} else {
		fs.state.Msgs = 0
	}
	if bytes < fs.state.Bytes {
		fs.state.Bytes -= bytes
	} else {
		fs.state.Bytes = 0
	}
	// Make sure we properly set the fs first sequence and timestamp.
	fs.selectNextFirst()

	// Check if we have no messages and blocks left.
	// Write the tombstone by hand since all blocks were removed above.
	if fs.lmb == nil && last.seq != 0 {
		if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil {
			lmb.writeTombstone(last.seq, last.ts)
		}
		// Clear any global subject state.
		fs.psim, fs.tsl = fs.psim.Empty(), 0
	}

	// If we purged anything, make sure we kick flush state loop.
	if purged > 0 {
		fs.dirty++
	}
}
  2189  
  2190  func copyMsgBlocks(src []*msgBlock) []*msgBlock {
  2191  	if src == nil {
  2192  		return nil
  2193  	}
  2194  	dst := make([]*msgBlock, len(src))
  2195  	copy(dst, src)
  2196  	return dst
  2197  }
  2198  
  2199  // GetSeqFromTime looks for the first sequence number that has
  2200  // the message with >= timestamp.
  2201  // FIXME(dlc) - inefficient, and dumb really. Make this better.
  2202  func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 {
  2203  	fs.mu.RLock()
  2204  	lastSeq := fs.state.LastSeq
  2205  	closed := fs.closed
  2206  	fs.mu.RUnlock()
  2207  
  2208  	if closed {
  2209  		return 0
  2210  	}
  2211  
  2212  	mb := fs.selectMsgBlockForStart(t)
  2213  	if mb == nil {
  2214  		return lastSeq + 1
  2215  	}
  2216  
  2217  	fseq := atomic.LoadUint64(&mb.first.seq)
  2218  	lseq := atomic.LoadUint64(&mb.last.seq)
  2219  
  2220  	var smv StoreMsg
  2221  
  2222  	// Linear search, hence the dumb part..
  2223  	ts := t.UnixNano()
  2224  	for seq := fseq; seq <= lseq; seq++ {
  2225  		sm, _, _ := mb.fetchMsg(seq, &smv)
  2226  		if sm != nil && sm.ts >= ts {
  2227  			return sm.seq
  2228  		}
  2229  	}
  2230  	return 0
  2231  }
  2232  
// Find the first matching message against a sublist.
// Returns the first message at or after start whose subject matches sl.
// On success the bool reports whether the caller may expire the cache (the
// match was the last sequence and also the last lookup); on
// ErrStoreMsgNotFound it reports whether we loaded the cache here.
func (mb *msgBlock) firstMatchingMulti(sl *Sublist, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	// Will just do linear walk for now.
	// TODO(dlc) - Be better at skipping blocks that will not match us regardless.

	var didLoad bool
	// Need messages loaded from here on out.
	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return nil, false, err
		}
		didLoad = true
	}

	// Make sure to start at mb.first.seq if fseq < mb.first.seq
	if seq := atomic.LoadUint64(&mb.first.seq); seq > start {
		start = seq
	}
	lseq := atomic.LoadUint64(&mb.last.seq)

	// Allow callers to pass nil for the reusable message buffer.
	if sm == nil {
		sm = new(StoreMsg)
	}

	var result SublistResult
	for seq := start; seq <= lseq; seq++ {
		// Save llseq, which tracks the last looked-up sequence, so we can
		// restore it if this message turns out not to match.
		llseq := mb.llseq
		fsm, err := mb.cacheLookup(seq, sm)
		if err != nil {
			continue
		}
		expireOk := seq == lseq && mb.llseq == seq

		if r := sl.MatchWithResult(fsm.subj, &result); len(r.psubs) > 0 {
			return fsm, expireOk, nil
		}
		// If we are here we did not match, so put the llseq back.
		mb.llseq = llseq
	}
	return nil, didLoad, ErrStoreMsgNotFound
}
  2277  
  2278  // Find the first matching message.
  2279  func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
  2280  	mb.mu.Lock()
  2281  	defer mb.mu.Unlock()
  2282  
  2283  	fseq, isAll, subs := start, filter == _EMPTY_ || filter == fwcs, []string{filter}
  2284  
  2285  	var didLoad bool
  2286  	if mb.fssNotLoaded() {
  2287  		// Make sure we have fss loaded.
  2288  		mb.loadMsgsWithLock()
  2289  		didLoad = true
  2290  	}
  2291  
  2292  	// If we only have 1 subject currently and it matches our filter we can also set isAll.
  2293  	if !isAll && len(mb.fss) == 1 {
  2294  		_, isAll = mb.fss[filter]
  2295  	}
  2296  	// Make sure to start at mb.first.seq if fseq < mb.first.seq
  2297  	if seq := atomic.LoadUint64(&mb.first.seq); seq > fseq {
  2298  		fseq = seq
  2299  	}
  2300  	lseq := atomic.LoadUint64(&mb.last.seq)
  2301  
  2302  	// Optionally build the isMatch for wildcard filters.
  2303  	tsa := [32]string{}
  2304  	fsa := [32]string{}
  2305  	var fts []string
  2306  	var isMatch func(subj string) bool
  2307  	// Decide to build.
  2308  	if wc {
  2309  		fts = tokenizeSubjectIntoSlice(fsa[:0], filter)
  2310  		isMatch = func(subj string) bool {
  2311  			tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
  2312  			return isSubsetMatchTokenized(tts, fts)
  2313  		}
  2314  	}
  2315  	// Only do linear scan if isAll or we are wildcarded and have to traverse more fss than actual messages.
  2316  	doLinearScan := isAll || (wc && len(mb.fss) > int(lseq-fseq))
  2317  
  2318  	if !doLinearScan {
  2319  		// If we have a wildcard match against all tracked subjects we know about.
  2320  		if wc {
  2321  			subs = subs[:0]
  2322  			for subj := range mb.fss {
  2323  				if isMatch(subj) {
  2324  					subs = append(subs, subj)
  2325  				}
  2326  			}
  2327  			// Check if we matched anything
  2328  			if len(subs) == 0 {
  2329  				return nil, didLoad, ErrStoreMsgNotFound
  2330  			}
  2331  		}
  2332  		fseq = lseq + 1
  2333  		for _, subj := range subs {
  2334  			ss := mb.fss[subj]
  2335  			if ss != nil && ss.firstNeedsUpdate {
  2336  				mb.recalculateFirstForSubj(subj, ss.First, ss)
  2337  			}
  2338  			if ss == nil || start > ss.Last || ss.First >= fseq {
  2339  				continue
  2340  			}
  2341  			if ss.First < start {
  2342  				fseq = start
  2343  			} else {
  2344  				fseq = ss.First
  2345  			}
  2346  		}
  2347  	}
  2348  
  2349  	if fseq > lseq {
  2350  		return nil, didLoad, ErrStoreMsgNotFound
  2351  	}
  2352  
  2353  	// If we guess to not do a linear scan, but the above resulted in alot of subs that will
  2354  	// need to be checked for every scanned message, revert.
  2355  	// TODO(dlc) - we could memoize the subs across calls.
  2356  	if len(subs) > int(lseq-fseq) {
  2357  		doLinearScan = true
  2358  	}
  2359  
  2360  	// Need messages loaded from here on out.
  2361  	if mb.cacheNotLoaded() {
  2362  		if err := mb.loadMsgsWithLock(); err != nil {
  2363  			return nil, false, err
  2364  		}
  2365  		didLoad = true
  2366  	}
  2367  
  2368  	if sm == nil {
  2369  		sm = new(StoreMsg)
  2370  	}
  2371  
  2372  	for seq := fseq; seq <= lseq; seq++ {
  2373  		llseq := mb.llseq
  2374  		fsm, err := mb.cacheLookup(seq, sm)
  2375  		if err != nil {
  2376  			if err == errPartialCache || err == errNoCache {
  2377  				return nil, false, err
  2378  			}
  2379  			continue
  2380  		}
  2381  		expireOk := seq == lseq && mb.llseq == seq
  2382  		if isAll {
  2383  			return fsm, expireOk, nil
  2384  		}
  2385  		if doLinearScan {
  2386  			if wc && isMatch(sm.subj) {
  2387  				return fsm, expireOk, nil
  2388  			} else if !wc && fsm.subj == filter {
  2389  				return fsm, expireOk, nil
  2390  			}
  2391  		} else {
  2392  			for _, subj := range subs {
  2393  				if fsm.subj == subj {
  2394  					return fsm, expireOk, nil
  2395  				}
  2396  			}
  2397  		}
  2398  		// If we are here we did not match, so put the llseq back.
  2399  		mb.llseq = llseq
  2400  	}
  2401  
  2402  	return nil, didLoad, ErrStoreMsgNotFound
  2403  }
  2404  
// This will traverse a message block and generate the filtered pending.
// Convenience wrapper that takes the block lock and delegates to
// filteredPendingLocked.
func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, first, last uint64) {
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.filteredPendingLocked(subj, wc, seq)
}
  2411  
// This will traverse a message block and generate the filtered pending.
// Returns the number of matching messages plus the first and last matching
// sequences within this block, considering only sequences >= sseq.
// Lock should be held.
func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (total, first, last uint64) {
	isAll := filter == _EMPTY_ || filter == fwcs

	// First check if we can optimize this part.
	// This means we want all and the starting sequence was before this block.
	if isAll {
		if fseq := atomic.LoadUint64(&mb.first.seq); sseq <= fseq {
			return mb.msgs, fseq, atomic.LoadUint64(&mb.last.seq)
		}
	}

	// Fold one per-subject state into our running totals.
	update := func(ss *SimpleState) {
		total += ss.Msgs
		if first == 0 || ss.First < first {
			first = ss.First
		}
		if ss.Last > last {
			last = ss.Last
		}
	}

	// Make sure we have fss loaded.
	mb.ensurePerSubjectInfoLoaded()

	tsa := [32]string{}
	fsa := [32]string{}
	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)

	// 1. See if we match any subs from fss.
	// 2. If we match and the sseq is past ss.Last then we can use meta only.
	// 3. If we match and we need to do a partial, break and clear any totals and do a full scan like num pending.

	isMatch := func(subj string) bool {
		if !wc {
			return subj == filter
		}
		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
		return isSubsetMatchTokenized(tts, fts)
	}

	var havePartial bool
	for subj, ss := range mb.fss {
		if isAll || isMatch(subj) {
			if ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			if sseq <= ss.First {
				update(ss)
			} else if sseq <= ss.Last {
				// We matched but its a partial.
				havePartial = true
				break
			}
		}
	}

	// If we did not encounter any partials we can return here.
	if !havePartial {
		return total, first, last
	}

	// If we are here we need to scan the msgs.
	// Clear what we had.
	total, first, last = 0, 0, 0

	// If we load the cache for a linear scan we want to expire that cache upon exit.
	var shouldExpire bool
	if mb.cacheNotLoaded() {
		mb.loadMsgsWithLock()
		shouldExpire = true
	}

	// Full scan from sseq, counting every matching message.
	var smv StoreMsg
	for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ {
		sm, _ := mb.cacheLookup(seq, &smv)
		if sm == nil {
			continue
		}
		if isAll || isMatch(sm.subj) {
			total++
			if first == 0 || seq < first {
				first = seq
			}
			if seq > last {
				last = seq
			}
		}
	}
	// If we loaded this block for this operation go ahead and expire it here.
	if shouldExpire {
		mb.tryForceExpireCacheLocked()
	}

	return total, first, last
}
  2509  
// FilteredState will return the SimpleState associated with the filtered subject and a proposed starting sequence.
func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	lseq := fs.state.LastSeq
	// Clamp the starting sequence to our first known sequence.
	if sseq < fs.state.FirstSeq {
		sseq = fs.state.FirstSeq
	}

	// Returned state.
	var ss SimpleState

	// If past the end no results.
	if sseq > lseq {
		// Make sure we track sequences
		ss.First = fs.state.FirstSeq
		ss.Last = fs.state.LastSeq
		return ss
	}

	// If we want all msgs that match we can shortcircuit.
	// TODO(dlc) - This can be extended for all cases but would
	// need to be careful on total msgs calculations etc.
	if sseq == fs.state.FirstSeq {
		fs.numFilteredPending(subj, &ss)
	} else {
		wc := subjectHasWildcard(subj)
		// Tracking subject state.
		// TODO(dlc) - Optimize for 2.10 with avl tree and no atomics per block.
		for _, mb := range fs.blks {
			// Skip blocks that are less than our starting sequence.
			if sseq > atomic.LoadUint64(&mb.last.seq) {
				continue
			}
			// Accumulate this block's contribution.
			t, f, l := mb.filteredPending(subj, wc, sseq)
			ss.Msgs += t
			if ss.First == 0 || (f > 0 && f < ss.First) {
				ss.First = f
			}
			if l > ss.Last {
				ss.Last = l
			}
		}
	}

	return ss
}
  2558  
// Optimized way for getting all num pending matching a filter subject.
// Totals come from the per-subject index (psim); blocks are only consulted
// to resolve the first and last matching sequences.
// Lock should be held.
func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) {
	isAll := filter == _EMPTY_ || filter == fwcs

	// If isAll we do not need to do anything special to calculate the first and last and total.
	if isAll {
		ss.First = fs.state.FirstSeq
		ss.Last = fs.state.LastSeq
		ss.Msgs = fs.state.Msgs
		return
	}

	// Track the lowest and highest block index containing a match.
	start, stop := uint32(math.MaxUint32), uint32(0)
	fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) {
		ss.Msgs += psi.total
		// Keep track of start and stop indexes for this subject.
		if psi.fblk < start {
			start = psi.fblk
		}
		if psi.lblk > stop {
			stop = psi.lblk
		}
	})
	// We do need to figure out the first and last sequences.
	wc := subjectHasWildcard(filter)
	// Do start
	mb := fs.bim[start]
	if mb != nil {
		_, f, _ := mb.filteredPending(filter, wc, 0)
		ss.First = f
	}
	if ss.First == 0 {
		// This is a miss. This can happen since psi.fblk is lazy, but should be very rare.
		// Walk forward until some block yields a first sequence.
		for i := start + 1; i <= stop; i++ {
			mb := fs.bim[i]
			if mb == nil {
				continue
			}
			if _, f, _ := mb.filteredPending(filter, wc, 0); f > 0 {
				ss.First = f
				break
			}
		}
	}
	// Now last
	if mb = fs.bim[stop]; mb != nil {
		_, _, l := mb.filteredPending(filter, wc, 0)
		ss.Last = l
	}
}
  2610  
// SubjectsState returns a map of SimpleState for all matching subjects.
// Returns nil if the store is empty, subjects are not tracked, or a literal
// subject is not present.
func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.state.Msgs == 0 || fs.noTrackSubjects() {
		return nil
	}

	// Default to scanning all blocks. fs.blks is non-empty since we have msgs.
	start, stop := fs.blks[0], fs.lmb
	// We can short circuit if not a wildcard using psim for start and stop.
	if !subjectHasWildcard(subject) {
		info, ok := fs.psim.Find(stringToBytes(subject))
		if !ok {
			return nil
		}
		start, stop = fs.bim[info.fblk], fs.bim[info.lblk]
	}

	// Aggregate fss.
	fss := make(map[string]SimpleState)
	var startFound bool

	for _, mb := range fs.blks {
		// Skip ahead until we reach the starting block.
		if !startFound {
			if mb != start {
				continue
			}
			startFound = true
		}

		mb.mu.Lock()
		var shouldExpire bool
		if mb.fssNotLoaded() {
			// Make sure we have fss loaded.
			mb.loadMsgsWithLock()
			shouldExpire = true
		}
		for subj, ss := range mb.fss {
			if subject == _EMPTY_ || subject == fwcs || subjectIsSubsetMatch(subj, subject) {
				if ss.firstNeedsUpdate {
					mb.recalculateFirstForSubj(subj, ss.First, ss)
				}
				oss := fss[subj]
				if oss.First == 0 { // New
					fss[subj] = *ss
				} else {
					// Merge here.
					oss.Last, oss.Msgs = ss.Last, oss.Msgs+ss.Msgs
					fss[subj] = oss
				}
			}
		}
		if shouldExpire {
			// Expire this cache before moving on.
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()

		if mb == stop {
			break
		}
	}

	return fss
}
  2677  
// MultiLastSeqs will return a sorted list of sequences that match all subjects presented in filters.
// We will not exceed the maxSeq, which if 0 becomes the store's last sequence.
// Returns ErrTooManyResults if maxAllowed > 0 and the result set grows past it.
func (fs *fileStore) MultiLastSeqs(filters []string, maxSeq uint64, maxAllowed int) ([]uint64, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.state.Msgs == 0 || fs.noTrackSubjects() {
		return nil, nil
	}

	lastBlkIndex := len(fs.blks) - 1
	lastMB := fs.blks[lastBlkIndex]

	// Implied last sequence.
	if maxSeq == 0 {
		maxSeq = fs.state.LastSeq
	} else {
		// Update last mb index if not last seq.
		lastBlkIndex, lastMB = fs.selectMsgBlockWithIndex(maxSeq)
	}
	// Make sure non-nil.
	if lastMB == nil {
		return nil, nil
	}

	// Grab our last mb index (not same as blk index).
	lastMB.mu.RLock()
	lastMBIndex := lastMB.index
	lastMB.mu.RUnlock()

	// Gather per-subject state for all filters. ltSeen records subjects whose
	// last block is strictly before our starting block.
	subs := make(map[string]*psi)
	ltSeen := make(map[string]uint32)
	for _, filter := range filters {
		fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) {
			s := string(subj)
			subs[s] = psi
			if psi.lblk < lastMBIndex {
				ltSeen[s] = psi.lblk
			}
		})
	}

	// If all subjects have a lower last index, select the largest for our walk backwards.
	if len(ltSeen) == len(subs) {
		max := uint32(0)
		for _, mbi := range ltSeen {
			if mbi > max {
				max = mbi
			}
		}
		lastMB = fs.bim[max]
	}

	// Collect all sequences needed.
	// Walk blocks backwards from the starting block until every subject is resolved.
	seqs := make([]uint64, 0, len(subs))
	for i, lnf := lastBlkIndex, false; i >= 0; i-- {
		if len(subs) == 0 {
			break
		}
		mb := fs.blks[i]
		if !lnf {
			if mb != lastMB {
				continue
			}
			lnf = true
		}
		// We can start properly looking here.
		mb.mu.Lock()
		mb.ensurePerSubjectInfoLoaded()
		for subj, psi := range subs {
			if ss := mb.fss[subj]; ss != nil {
				if ss.Last <= maxSeq {
					seqs = append(seqs, ss.Last)
					delete(subs, subj)
				} else {
					// Need to search for it since last is > maxSeq.
					// Scan backwards from maxSeq for the newest msg on this subject.
					if mb.cacheNotLoaded() {
						mb.loadMsgsWithLock()
					}
					var smv StoreMsg
					fseq := atomic.LoadUint64(&mb.first.seq)
					for seq := maxSeq; seq >= fseq; seq-- {
						sm, _ := mb.cacheLookup(seq, &smv)
						if sm == nil || sm.subj != subj {
							continue
						}
						seqs = append(seqs, sm.seq)
						delete(subs, subj)
						break
					}
				}
			} else if mb.index <= psi.fblk {
				// Track which subs are no longer applicable, meaning we will not find a valid msg at this point.
				delete(subs, subj)
			}
			// TODO(dlc) we could track lblk like above in case some subs are very far apart.
			// Not too bad if fss loaded since we will skip over quickly with it loaded, but might be worth it.
		}
		mb.mu.Unlock()

		// If maxAllowed was specified check that we will not exceed that.
		if maxAllowed > 0 && len(seqs) > maxAllowed {
			return nil, ErrTooManyResults
		}

	}
	if len(seqs) == 0 {
		return nil, nil
	}
	sort.Slice(seqs, func(i, j int) bool { return seqs[i] < seqs[j] })
	return seqs, nil
}
  2790  
  2791  // NumPending will return the number of pending messages matching the filter subject starting at sequence.
  2792  // Optimized for stream num pending calculations for consumers.
  2793  func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) (total, validThrough uint64) {
  2794  	fs.mu.RLock()
  2795  	defer fs.mu.RUnlock()
  2796  
  2797  	// This can always be last for these purposes.
  2798  	validThrough = fs.state.LastSeq
  2799  
  2800  	if fs.state.Msgs == 0 || sseq > fs.state.LastSeq {
  2801  		return 0, validThrough
  2802  	}
  2803  
  2804  	// Track starting for both block for the sseq and staring block that matches any subject.
  2805  	var seqStart int
  2806  	// See if we need to figure out starting block per sseq.
  2807  	if sseq > fs.state.FirstSeq {
  2808  		// This should not, but can return -1, so make sure we check to avoid panic below.
  2809  		if seqStart, _ = fs.selectMsgBlockWithIndex(sseq); seqStart < 0 {
  2810  			seqStart = 0
  2811  		}
  2812  	}
  2813  
  2814  	isAll := filter == _EMPTY_ || filter == fwcs
  2815  	wc := subjectHasWildcard(filter)
  2816  
  2817  	// See if filter was provided but its the only subject.
  2818  	if !isAll && !wc && fs.psim.Size() == 1 {
  2819  		if _, ok := fs.psim.Find(stringToBytes(filter)); ok {
  2820  			isAll = true
  2821  		}
  2822  	}
  2823  	if isAll && filter == _EMPTY_ {
  2824  		filter = fwcs
  2825  	}
  2826  	// If we are isAll and have no deleted we can do a simpler calculation.
  2827  	if !lastPerSubject && isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs {
  2828  		if sseq == 0 {
  2829  			return fs.state.Msgs, validThrough
  2830  		}
  2831  		return fs.state.LastSeq - sseq + 1, validThrough
  2832  	}
  2833  
  2834  	var tsa, fsa [32]string
  2835  	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
  2836  
  2837  	isMatch := func(subj string) bool {
  2838  		if isAll {
  2839  			return true
  2840  		}
  2841  		if !wc {
  2842  			return subj == filter
  2843  		}
  2844  		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
  2845  		return isSubsetMatchTokenized(tts, fts)
  2846  	}
  2847  
  2848  	// Handle last by subject a bit differently.
  2849  	// We will scan PSIM since we accurately track the last block we have seen the subject in. This
  2850  	// allows us to only need to load at most one block now.
  2851  	// For the last block, we need to track the subjects that we know are in that block, and track seen
  2852  	// while in the block itself, but complexity there worth it.
  2853  	if lastPerSubject {
  2854  		// If we want all and our start sequence is equal or less than first return number of subjects.
  2855  		if isAll && sseq <= fs.state.FirstSeq {
  2856  			return uint64(fs.psim.Size()), validThrough
  2857  		}
  2858  		// If we are here we need to scan. We are going to scan the PSIM looking for lblks that are >= seqStart.
  2859  		// This will build up a list of all subjects from the selected block onward.
  2860  		lbm := make(map[string]bool)
  2861  		mb := fs.blks[seqStart]
  2862  		bi := mb.index
  2863  
  2864  		fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) {
  2865  			// If the select blk start is greater than entry's last blk skip.
  2866  			if bi > psi.lblk {
  2867  				return
  2868  			}
  2869  			total++
  2870  			// We will track the subjects that are an exact match to the last block.
  2871  			// This is needed for last block processing.
  2872  			if psi.lblk == bi {
  2873  				lbm[string(subj)] = true
  2874  			}
  2875  		})
  2876  
  2877  		// Now check if we need to inspect the seqStart block.
  2878  		// Grab write lock in case we need to load in msgs.
  2879  		mb.mu.Lock()
  2880  		var shouldExpire bool
  2881  		// We need to walk this block to correct accounting from above.
  2882  		if sseq > mb.first.seq {
  2883  			// Track the ones we add back in case more than one.
  2884  			seen := make(map[string]bool)
  2885  			// We need to discount the total by subjects seen before sseq, but also add them right back in if they are >= sseq for this blk.
  2886  			// This only should be subjects we know have the last blk in this block.
  2887  			if mb.cacheNotLoaded() {
  2888  				mb.loadMsgsWithLock()
  2889  				shouldExpire = true
  2890  			}
  2891  			var smv StoreMsg
  2892  			for seq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ {
  2893  				sm, _ := mb.cacheLookup(seq, &smv)
  2894  				if sm == nil || sm.subj == _EMPTY_ || !lbm[sm.subj] {
  2895  					continue
  2896  				}
  2897  				if isMatch(sm.subj) {
  2898  					// If less than sseq adjust off of total as long as this subject matched the last block.
  2899  					if seq < sseq {
  2900  						if !seen[sm.subj] {
  2901  							total--
  2902  							seen[sm.subj] = true
  2903  						}
  2904  					} else if seen[sm.subj] {
  2905  						// This is equal or more than sseq, so add back in.
  2906  						total++
  2907  						// Make sure to not process anymore.
  2908  						delete(seen, sm.subj)
  2909  					}
  2910  				}
  2911  			}
  2912  		}
  2913  		// If we loaded the block try to force expire.
  2914  		if shouldExpire {
  2915  			mb.tryForceExpireCacheLocked()
  2916  		}
  2917  		mb.mu.Unlock()
  2918  		return total, validThrough
  2919  	}
  2920  
  2921  	// If we would need to scan more from the beginning, revert back to calculating directly here.
  2922  	// TODO(dlc) - Redo properly with sublists etc for subject-based filtering.
  2923  	if seqStart >= (len(fs.blks) / 2) {
  2924  		for i := seqStart; i < len(fs.blks); i++ {
  2925  			var shouldExpire bool
  2926  			mb := fs.blks[i]
  2927  			// Hold write lock in case we need to load cache.
  2928  			mb.mu.Lock()
  2929  			var t uint64
  2930  			if isAll && sseq <= atomic.LoadUint64(&mb.first.seq) {
  2931  				total += mb.msgs
  2932  				mb.mu.Unlock()
  2933  				continue
  2934  			}
  2935  			// If we are here we need to at least scan the subject fss.
  2936  			// Make sure we have fss loaded.
  2937  			if mb.fssNotLoaded() {
  2938  				mb.loadMsgsWithLock()
  2939  				shouldExpire = true
  2940  			}
  2941  			var havePartial bool
  2942  			for subj, ss := range mb.fss {
  2943  				if isMatch(subj) {
  2944  					if ss.firstNeedsUpdate {
  2945  						mb.recalculateFirstForSubj(subj, ss.First, ss)
  2946  					}
  2947  					if sseq <= ss.First {
  2948  						t += ss.Msgs
  2949  					} else if sseq <= ss.Last {
  2950  						// We matched but its a partial.
  2951  						havePartial = true
  2952  						break
  2953  					}
  2954  				}
  2955  			}
  2956  			// See if we need to scan msgs here.
  2957  			if havePartial {
  2958  				// Make sure we have the cache loaded.
  2959  				if mb.cacheNotLoaded() {
  2960  					mb.loadMsgsWithLock()
  2961  					shouldExpire = true
  2962  				}
  2963  				// Clear on partial.
  2964  				t = 0
  2965  				var smv StoreMsg
  2966  				for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ {
  2967  					if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && isMatch(sm.subj) {
  2968  						t++
  2969  					}
  2970  				}
  2971  			}
  2972  			// If we loaded this block for this operation go ahead and expire it here.
  2973  			if shouldExpire {
  2974  				mb.tryForceExpireCacheLocked()
  2975  			}
  2976  			mb.mu.Unlock()
  2977  			total += t
  2978  		}
  2979  		return total, validThrough
  2980  	}
  2981  
  2982  	// If we are here it's better to calculate totals from psim and adjust downward by scanning less blocks.
  2983  	// TODO(dlc) - Eventually when sublist uses generics, make this sublist driven instead.
  2984  	start := uint32(math.MaxUint32)
  2985  	fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) {
  2986  		total += psi.total
  2987  		// Keep track of start index for this subject.
  2988  		if psi.fblk < start {
  2989  			start = psi.fblk
  2990  		}
  2991  	})
  2992  	// See if we were asked for all, if so we are done.
  2993  	if sseq <= fs.state.FirstSeq {
  2994  		return total, validThrough
  2995  	}
  2996  
  2997  	// If we are here we need to calculate partials for the first blocks.
  2998  	firstSubjBlk := fs.bim[start]
  2999  	var firstSubjBlkFound bool
  3000  	// Adjust in case not found.
  3001  	if firstSubjBlk == nil {
  3002  		firstSubjBlkFound = true
  3003  	}
  3004  
  3005  	// Track how many we need to adjust against the total.
  3006  	var adjust uint64
  3007  	for i := 0; i <= seqStart; i++ {
  3008  		mb := fs.blks[i]
  3009  		// We can skip blks if we know they are below the first one that has any subject matches.
  3010  		if !firstSubjBlkFound {
  3011  			if firstSubjBlkFound = (mb == firstSubjBlk); !firstSubjBlkFound {
  3012  				continue
  3013  			}
  3014  		}
  3015  		// We need to scan this block.
  3016  		var shouldExpire bool
  3017  		mb.mu.Lock()
  3018  		// Check if we should include all of this block in adjusting. If so work with metadata.
  3019  		if sseq > atomic.LoadUint64(&mb.last.seq) {
  3020  			if isAll {
  3021  				adjust += mb.msgs
  3022  			} else {
  3023  				// We need to adjust for all matches in this block.
  3024  				// Make sure we have fss loaded. This loads whole block now.
  3025  				if mb.fssNotLoaded() {
  3026  					mb.loadMsgsWithLock()
  3027  					shouldExpire = true
  3028  				}
  3029  				for subj, ss := range mb.fss {
  3030  					if isMatch(subj) {
  3031  						adjust += ss.Msgs
  3032  					}
  3033  				}
  3034  			}
  3035  		} else {
  3036  			// This is the last block. We need to scan per message here.
  3037  			if mb.cacheNotLoaded() {
  3038  				mb.loadMsgsWithLock()
  3039  				shouldExpire = true
  3040  			}
  3041  			var last = atomic.LoadUint64(&mb.last.seq)
  3042  			if sseq < last {
  3043  				last = sseq
  3044  			}
  3045  			// We need to walk all messages in this block
  3046  			var smv StoreMsg
  3047  			for seq := atomic.LoadUint64(&mb.first.seq); seq < last; seq++ {
  3048  				sm, _ := mb.cacheLookup(seq, &smv)
  3049  				if sm == nil || sm.subj == _EMPTY_ {
  3050  					continue
  3051  				}
  3052  				// Check if it matches our filter.
  3053  				if sm.seq < sseq && isMatch(sm.subj) {
  3054  					adjust++
  3055  				}
  3056  			}
  3057  		}
  3058  		// If we loaded the block try to force expire.
  3059  		if shouldExpire {
  3060  			mb.tryForceExpireCacheLocked()
  3061  		}
  3062  		mb.mu.Unlock()
  3063  	}
  3064  	// Make final adjustment.
  3065  	total -= adjust
  3066  
  3067  	return total, validThrough
  3068  }
  3069  
  3070  // SubjectsTotal return message totals per subject.
  3071  func (fs *fileStore) SubjectsTotals(filter string) map[string]uint64 {
  3072  	fs.mu.RLock()
  3073  	defer fs.mu.RUnlock()
  3074  
  3075  	if fs.psim.Size() == 0 {
  3076  		return nil
  3077  	}
  3078  	// Match all if no filter given.
  3079  	if filter == _EMPTY_ {
  3080  		filter = fwcs
  3081  	}
  3082  	fst := make(map[string]uint64)
  3083  	fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) {
  3084  		fst[string(subj)] = psi.total
  3085  	})
  3086  	return fst
  3087  }
  3088  
  3089  // RegisterStorageUpdates registers a callback for updates to storage changes.
  3090  // It will present number of messages and bytes as a signed integer and an
  3091  // optional sequence number of the message if a single.
  3092  func (fs *fileStore) RegisterStorageUpdates(cb StorageUpdateHandler) {
  3093  	fs.mu.Lock()
  3094  	fs.scb = cb
  3095  	bsz := fs.state.Bytes
  3096  	fs.mu.Unlock()
  3097  	if cb != nil && bsz > 0 {
  3098  		cb(0, int64(bsz), 0, _EMPTY_)
  3099  	}
  3100  }
  3101  
  3102  // Helper to get hash key for specific message block.
  3103  // Lock should be held
  3104  func (fs *fileStore) hashKeyForBlock(index uint32) []byte {
  3105  	return []byte(fmt.Sprintf("%s-%d", fs.cfg.Name, index))
  3106  }
  3107  
// setupWriteCache initializes the write cache for this message block, reusing
// buf (possibly nil) as the initial buffer. If backing data already exists on
// disk, the cache offset is set to the current file size. No-op if a cache is
// already in place. mb lock should be held.
func (mb *msgBlock) setupWriteCache(buf []byte) {
	// Make sure we have a cache setup.
	if mb.cache != nil {
		return
	}

	// Setup simple cache.
	mb.cache = &cache{buf: buf}
	// Make sure we set the proper cache offset if we have existing data.
	// Prefer the open file descriptor; fall back to stat by name.
	var fi os.FileInfo
	if mb.mfd != nil {
		fi, _ = mb.mfd.Stat()
	} else if mb.mfn != _EMPTY_ {
		fi, _ = os.Stat(mb.mfn)
	}
	if fi != nil {
		mb.cache.off = int(fi.Size())
	}
	// Record a load time and ensure the cache expiration timer is running.
	mb.llts = time.Now().UnixNano()
	mb.startCacheExpireTimer()
}
  3129  
  3130  // This rolls to a new append msg block.
  3131  // Lock should be held.
  3132  func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) {
  3133  	index := uint32(1)
  3134  	var rbuf []byte
  3135  
  3136  	if lmb := fs.lmb; lmb != nil {
  3137  		index = lmb.index + 1
  3138  		// Determine if we can reclaim any resources here.
  3139  		if fs.fip {
  3140  			lmb.mu.Lock()
  3141  			lmb.closeFDsLocked()
  3142  			if lmb.cache != nil {
  3143  				// Reset write timestamp and see if we can expire this cache.
  3144  				rbuf = lmb.tryExpireWriteCache()
  3145  			}
  3146  			lmb.mu.Unlock()
  3147  		}
  3148  	}
  3149  
  3150  	mb := fs.initMsgBlock(index)
  3151  	// Lock should be held to quiet race detector.
  3152  	mb.mu.Lock()
  3153  	mb.setupWriteCache(rbuf)
  3154  	mb.fss = make(map[string]*SimpleState)
  3155  
  3156  	// Set cache time to creation time to start.
  3157  	ts := time.Now().UnixNano()
  3158  	mb.llts, mb.lwts = 0, ts
  3159  	// Remember our last sequence number.
  3160  	atomic.StoreUint64(&mb.first.seq, fs.state.LastSeq+1)
  3161  	atomic.StoreUint64(&mb.last.seq, fs.state.LastSeq)
  3162  	mb.mu.Unlock()
  3163  
  3164  	// Now do local hash.
  3165  	key := sha256.Sum256(fs.hashKeyForBlock(index))
  3166  	hh, err := highwayhash.New64(key[:])
  3167  	if err != nil {
  3168  		return nil, fmt.Errorf("could not create hash: %v", err)
  3169  	}
  3170  	mb.hh = hh
  3171  
  3172  	<-dios
  3173  	mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
  3174  	dios <- struct{}{}
  3175  
  3176  	if err != nil {
  3177  		mb.dirtyCloseWithRemove(true)
  3178  		return nil, fmt.Errorf("Error creating msg block file: %v", err)
  3179  	}
  3180  	mb.mfd = mfd
  3181  
  3182  	// Check if encryption is enabled.
  3183  	if fs.prf != nil {
  3184  		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
  3185  			return nil, err
  3186  		}
  3187  	}
  3188  
  3189  	// If we know we will need this so go ahead and spin up.
  3190  	if !fs.fip {
  3191  		mb.spinUpFlushLoop()
  3192  	}
  3193  
  3194  	// Add to our list of blocks and mark as last.
  3195  	fs.addMsgBlock(mb)
  3196  
  3197  	return mb, nil
  3198  }
  3199  
// Generate the keys for this message block and write them out to the block's
// key file in the msg directory. On success sets the block's aek, bek, seed
// and nonce fields and records the key file name in mb.kfn.
func (fs *fileStore) genEncryptionKeysForBlock(mb *msgBlock) error {
	if mb == nil {
		return nil
	}
	// Derive per-block keys; the context string ties them to this stream and block index.
	key, bek, seed, encrypted, err := fs.genEncryptionKeys(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))
	if err != nil {
		return err
	}
	// The nonce is the leading NonceSize bytes of the encrypted key blob.
	mb.aek, mb.bek, mb.seed, mb.nonce = key, bek, seed, encrypted[:key.NonceSize()]
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))
	// Bail only on unexpected stat errors; missing or existing files are fine here.
	if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
		return err
	}
	// Gate the write through the shared I/O token channel.
	<-dios
	err = os.WriteFile(keyFile, encrypted, defaultFilePerms)
	dios <- struct{}{}
	if err != nil {
		return err
	}
	mb.kfn = keyFile
	return nil
}
  3224  
// Stores a raw message with expected sequence number and timestamp.
// A non-zero seq must equal fs.state.LastSeq+1 or ErrSequenceMismatch is
// returned; seq == 0 means use the next sequence. Enforces discard policy
// and retention limits (max msgs, max bytes, max msgs per subject) as part
// of the store, and updates global and per-subject accounting.
// Lock should be held.
func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) (err error) {
	if fs.closed {
		return ErrStoreClosed
	}

	// Per subject max check needed.
	// psmc is the current per-subject message count, snapshotted before the write.
	mmp := uint64(fs.cfg.MaxMsgsPer)
	var psmc uint64
	psmax := mmp > 0 && len(subj) > 0
	if psmax {
		if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
			psmc = info.total
		}
	}

	var fseq uint64
	// Check if we are discarding new messages when we reach the limit.
	if fs.cfg.Discard == DiscardNew {
		// asl: at subject limit — this subject is already at its per-subject cap.
		var asl bool
		if psmax && psmc >= mmp {
			// If we are instructed to discard new per subject, this is an error.
			if fs.cfg.DiscardNewPer {
				return ErrMaxMsgsPerSubject
			}
			if fseq, err = fs.firstSeqForSubj(subj); err != nil {
				return err
			}
			asl = true
		}
		// If we are discard new and limits policy and clustered, we do the enforcement
		// above and should not disqualify the message here since it could cause replicas to drift.
		if fs.cfg.Retention == LimitsPolicy || fs.cfg.Replicas == 1 {
			if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl {
				return ErrMaxMsgs
			}
			if fs.cfg.MaxBytes > 0 && fs.state.Bytes+fileStoreMsgSize(subj, hdr, msg) >= uint64(fs.cfg.MaxBytes) {
				// Allow when removing the subject's first msg frees at least as many bytes.
				if !asl || fs.sizeForSeq(fseq) <= int(fileStoreMsgSize(subj, hdr, msg)) {
					return ErrMaxBytes
				}
			}
		}
	}

	// Check sequence.
	if seq != fs.state.LastSeq+1 {
		if seq > 0 {
			return ErrSequenceMismatch
		}
		// seq == 0 means auto-assign the next sequence.
		seq = fs.state.LastSeq + 1
	}

	// Write msg record. n is the stored record size in bytes.
	n, err := fs.writeMsgRecord(seq, ts, subj, hdr, msg)
	if err != nil {
		return err
	}

	// Adjust top level tracking of per subject msg counts.
	if len(subj) > 0 && fs.psim != nil {
		index := fs.lmb.index
		if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
			info.total++
			if index > info.lblk {
				info.lblk = index
			}
		} else {
			// First time we see this subject; track total subject length too.
			fs.psim.Insert(stringToBytes(subj), psi{total: 1, fblk: index, lblk: index})
			fs.tsl += len(subj)
		}
	}

	// Adjust first if needed.
	now := time.Unix(0, ts).UTC()
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq = seq
		fs.state.FirstTime = now
	}

	fs.state.Msgs++
	fs.state.Bytes += n
	fs.state.LastSeq = seq
	fs.state.LastTime = now

	// Enforce per message limits.
	// We snapshotted psmc before our actual write, so >= comparison needed.
	if psmax && psmc >= mmp {
		// We may have done this above.
		if fseq == 0 {
			fseq, _ = fs.firstSeqForSubj(subj)
		}
		if ok, _ := fs.removeMsgViaLimits(fseq); ok {
			// Make sure we are below the limit.
			if psmc--; psmc >= mmp {
				// Keep removing the subject's first message until at or below the cap.
				bsubj := stringToBytes(subj)
				for info, ok := fs.psim.Find(bsubj); ok && info.total > mmp; info, ok = fs.psim.Find(bsubj) {
					if seq, _ := fs.firstSeqForSubj(subj); seq > 0 {
						if ok, _ := fs.removeMsgViaLimits(seq); !ok {
							break
						}
					} else {
						break
					}
				}
			}
		} else if mb := fs.selectMsgBlock(fseq); mb != nil {
			// If we are here we could not remove fseq from above, so rebuild.
			var ld *LostStreamData
			if ld, _, _ = mb.rebuildState(); ld != nil {
				fs.rebuildStateLocked(ld)
			}
		}
	}

	// Limits checks and enforcement.
	// If they do any deletions they will update the
	// byte count on their own, so no need to compensate.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Check if we have and need the age expiration timer running.
	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
		fs.startAgeChk()
	}

	return nil
}
  3353  
// StoreRawMsg stores a raw message with expected sequence number and timestamp.
// On success the registered storage update callback (if any) is invoked
// outside the filestore lock.
func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) error {
	fs.mu.Lock()
	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	cb := fs.scb
	// Check if first message timestamp requires expiry
	// sooner than initial replica expiry timer set to MaxAge when initializing.
	if !fs.receivedAny && fs.cfg.MaxAge != 0 && ts > 0 {
		fs.receivedAny = true
		// don't block here by calling expireMsgs directly.
		// Instead, set short timeout.
		fs.resetAgeChk(int64(time.Millisecond * 50))
	}
	fs.mu.Unlock()

	// Fire the callback outside the lock to avoid re-entrancy issues.
	if err == nil && cb != nil {
		cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	}

	return err
}
  3375  
  3376  // Store stores a message. We hold the main filestore lock for any write operation.
  3377  func (fs *fileStore) StoreMsg(subj string, hdr, msg []byte) (uint64, int64, error) {
  3378  	fs.mu.Lock()
  3379  	seq, ts := fs.state.LastSeq+1, time.Now().UnixNano()
  3380  	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
  3381  	cb := fs.scb
  3382  	fs.mu.Unlock()
  3383  
  3384  	if err != nil {
  3385  		seq, ts = 0, 0
  3386  	} else if cb != nil {
  3387  		cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
  3388  	}
  3389  
  3390  	return seq, ts, err
  3391  }
  3392  
  3393  // skipMsg will update this message block for a skipped message.
  3394  // If we do not have any messages, just update the metadata, otherwise
  3395  // we will place an empty record marking the sequence as used. The
  3396  // sequence will be marked erased.
  3397  // fs lock should be held.
  3398  func (mb *msgBlock) skipMsg(seq uint64, now time.Time) {
  3399  	if mb == nil {
  3400  		return
  3401  	}
  3402  	var needsRecord bool
  3403  
  3404  	nowts := now.UnixNano()
  3405  
  3406  	mb.mu.Lock()
  3407  	// If we are empty can just do meta.
  3408  	if mb.msgs == 0 {
  3409  		atomic.StoreUint64(&mb.last.seq, seq)
  3410  		mb.last.ts = nowts
  3411  		atomic.StoreUint64(&mb.first.seq, seq+1)
  3412  		mb.first.ts = nowts
  3413  	} else {
  3414  		needsRecord = true
  3415  		mb.dmap.Insert(seq)
  3416  	}
  3417  	mb.mu.Unlock()
  3418  
  3419  	if needsRecord {
  3420  		mb.writeMsgRecord(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, nowts, true)
  3421  	} else {
  3422  		mb.kickFlusher()
  3423  	}
  3424  }
  3425  
// SkipMsg will use the next sequence number but not store anything.
// Returns the skipped sequence, or 0 if a new message block could not be created.
func (fs *fileStore) SkipMsg() uint64 {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Grab our current last message block.
	// Roll to a new block if we have none or this one can not fit an empty record.
	mb := fs.lmb
	if mb == nil || mb.msgs > 0 && mb.blkSize()+emptyRecordLen > fs.fcfg.BlockSize {
		if mb != nil && fs.fcfg.Compression != NoCompression {
			// We've now reached the end of this message block, if we want
			// to compress blocks then now's the time to do it.
			go mb.recompressOnDiskIfNeeded()
		}
		var err error
		if mb, err = fs.newMsgBlockForWrite(); err != nil {
			return 0
		}
	}

	// Grab time and last seq.
	now, seq := time.Now().UTC(), fs.state.LastSeq+1

	// Write skip msg.
	mb.skipMsg(seq, now)

	// Update fs state.
	fs.state.LastSeq, fs.state.LastTime = seq, now
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq, fs.state.FirstTime = seq, now
	}
	// If the skipped seq is the first seq, advance first past it.
	if seq == fs.state.FirstSeq {
		fs.state.FirstSeq, fs.state.FirstTime = seq+1, now
	}
	// Mark as dirty for stream state.
	fs.dirty++

	return seq
}
  3464  
// Skip multiple msgs. We will determine if we can fit into current lmb or we need to create a new block.
// seq must be fs.state.LastSeq+1 (or 0 to auto-assign); num is how many
// sequences to consume. Returns ErrSequenceMismatch on a bad sequence.
func (fs *fileStore) SkipMsgs(seq uint64, num uint64) error {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Check sequence matches our last sequence.
	if seq != fs.state.LastSeq+1 {
		if seq > 0 {
			return ErrSequenceMismatch
		}
		seq = fs.state.LastSeq + 1
	}

	// Limit number of dmap entries
	const maxDeletes = 64 * 1024
	mb := fs.lmb

	// Roll a new block if adding num deletes would exceed the dmap cap,
	// or the current block can not fit another empty record.
	numDeletes := int(num)
	if mb != nil {
		numDeletes += mb.dmap.Size()
	}
	if mb == nil || numDeletes > maxDeletes && mb.msgs > 0 || mb.msgs > 0 && mb.blkSize()+emptyRecordLen > fs.fcfg.BlockSize {
		if mb != nil && fs.fcfg.Compression != NoCompression {
			// We've now reached the end of this message block, if we want
			// to compress blocks then now's the time to do it.
			go mb.recompressOnDiskIfNeeded()
		}
		var err error
		if mb, err = fs.newMsgBlockForWrite(); err != nil {
			return err
		}
	}

	// Insert into dmap all entries and place last as marker.
	now := time.Now().UTC()
	nowts := now.UnixNano()
	// lseq is the last sequence consumed by this skip.
	lseq := seq + num - 1

	mb.mu.Lock()
	var needsRecord bool
	// If we are empty update meta directly.
	if mb.msgs == 0 {
		atomic.StoreUint64(&mb.last.seq, lseq)
		mb.last.ts = nowts
		atomic.StoreUint64(&mb.first.seq, lseq+1)
		mb.first.ts = nowts
	} else {
		needsRecord = true
		// Mark every skipped sequence deleted.
		for ; seq <= lseq; seq++ {
			mb.dmap.Insert(seq)
		}
	}
	mb.mu.Unlock()

	// Write out our placeholder.
	if needsRecord {
		mb.writeMsgRecord(emptyRecordLen, lseq|ebit, _EMPTY_, nil, nil, nowts, true)
	}

	// Now update FS accounting.
	// Update fs state.
	fs.state.LastSeq, fs.state.LastTime = lseq, now
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq, fs.state.FirstTime = lseq+1, now
	}

	// Mark as dirty for stream state.
	fs.dirty++

	return nil
}
  3536  
  3537  // Lock should be held.
  3538  func (fs *fileStore) rebuildFirst() {
  3539  	if len(fs.blks) == 0 {
  3540  		return
  3541  	}
  3542  	fmb := fs.blks[0]
  3543  	if fmb == nil {
  3544  		return
  3545  	}
  3546  
  3547  	ld, _, _ := fmb.rebuildState()
  3548  	fmb.mu.RLock()
  3549  	isEmpty := fmb.msgs == 0
  3550  	fmb.mu.RUnlock()
  3551  	if isEmpty {
  3552  		fmb.mu.Lock()
  3553  		fs.removeMsgBlock(fmb)
  3554  		fmb.mu.Unlock()
  3555  	}
  3556  	fs.selectNextFirst()
  3557  	fs.rebuildStateLocked(ld)
  3558  }
  3559  
// Optimized helper function to return first sequence.
// subj will always be publish subject here, meaning non-wildcard.
// We assume a fast check that this subj even exists already happened.
// Returns (0, nil) when the subject is not found in any block.
// Lock should be held.
func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) {
	if len(fs.blks) == 0 {
		return 0, nil
	}

	// See if we can optimize where we start.
	// Default to scanning all blocks; narrow via psim block range when known.
	start, stop := fs.blks[0].index, fs.lmb.index
	if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
		start, stop = info.fblk, info.lblk
	}

	for i := start; i <= stop; i++ {
		mb := fs.bim[i]
		if mb == nil {
			continue
		}
		mb.mu.Lock()
		var shouldExpire bool
		if mb.fssNotLoaded() {
			// Make sure we have fss loaded.
			if err := mb.loadMsgsWithLock(); err != nil {
				mb.mu.Unlock()
				return 0, err
			}
			shouldExpire = true
		}
		if ss := mb.fss[subj]; ss != nil {
			// Adjust first if it was not where we thought it should be.
			// This corrects a stale fblk hint in the psim entry.
			if i != start {
				if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
					info.fblk = i
				}
			}
			if ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			mb.mu.Unlock()
			return ss.First, nil
		}
		// If we did not find it and we loaded this msgBlock try to expire as long as not the last.
		if shouldExpire {
			// Expire this cache before moving on.
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()
	}
	return 0, nil
}
  3612  
  3613  // Will check the msg limit and drop firstSeq msg if needed.
  3614  // Lock should be held.
  3615  func (fs *fileStore) enforceMsgLimit() {
  3616  	if fs.cfg.Discard != DiscardOld {
  3617  		return
  3618  	}
  3619  	if fs.cfg.MaxMsgs <= 0 || fs.state.Msgs <= uint64(fs.cfg.MaxMsgs) {
  3620  		return
  3621  	}
  3622  	for nmsgs := fs.state.Msgs; nmsgs > uint64(fs.cfg.MaxMsgs); nmsgs = fs.state.Msgs {
  3623  		if removed, err := fs.deleteFirstMsg(); err != nil || !removed {
  3624  			fs.rebuildFirst()
  3625  			return
  3626  		}
  3627  	}
  3628  }
  3629  
  3630  // Will check the bytes limit and drop msgs if needed.
  3631  // Lock should be held.
  3632  func (fs *fileStore) enforceBytesLimit() {
  3633  	if fs.cfg.Discard != DiscardOld {
  3634  		return
  3635  	}
  3636  	if fs.cfg.MaxBytes <= 0 || fs.state.Bytes <= uint64(fs.cfg.MaxBytes) {
  3637  		return
  3638  	}
  3639  	for bs := fs.state.Bytes; bs > uint64(fs.cfg.MaxBytes); bs = fs.state.Bytes {
  3640  		if removed, err := fs.deleteFirstMsg(); err != nil || !removed {
  3641  			fs.rebuildFirst()
  3642  			return
  3643  		}
  3644  	}
  3645  }
  3646  
// Will make sure we have limits honored for max msgs per subject on recovery or config update.
// We will make sure to go through all msg blocks etc. but in practice this
// will most likely only be the last one, so can take a more conservative approach.
// fireCallback controls whether removals performed here invoke the storage
// update callback (suppressed on recovery since these were already accounted).
// Lock should be held.
func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) {
	maxMsgsPer := uint64(fs.cfg.MaxMsgsPer)

	// We may want to suppress callbacks from remove during this process
	// since these should have already been deleted and accounted for.
	if !fireCallback {
		cb := fs.scb
		fs.scb = nil
		defer func() { fs.scb = cb }()
	}

	var numMsgs uint64

	// collect all that are not correct.
	needAttention := make(map[string]*psi)
	fs.psim.Match([]byte(fwcs), func(subj []byte, psi *psi) {
		numMsgs += psi.total
		if psi.total > maxMsgsPer {
			needAttention[string(subj)] = psi
		}
	})

	// We had an issue with a use case where psim (and hence fss) were correct but idx was not and was not properly being caught.
	// So do a quick sanity check here. If we detect a skew do a rebuild then re-check.
	if numMsgs != fs.state.Msgs {
		fs.warn("Detected skew in subject-based total (%d) vs raw total (%d), rebuilding", numMsgs, fs.state.Msgs)
		// Clear any global subject state.
		fs.psim, fs.tsl = fs.psim.Empty(), 0
		for _, mb := range fs.blks {
			ld, _, err := mb.rebuildState()
			if err != nil && ld != nil {
				fs.addLostData(ld)
			}
			fs.populateGlobalPerSubjectInfo(mb)
		}
		// Rebuild fs state too.
		fs.rebuildStateLocked(nil)
		// Need to redo blocks that need attention.
		needAttention = make(map[string]*psi)
		fs.psim.Match([]byte(fwcs), func(subj []byte, psi *psi) {
			if psi.total > maxMsgsPer {
				needAttention[string(subj)] = psi
			}
		})
	}

	// Collect all the msgBlks we alter.
	blks := make(map[*msgBlock]struct{})

	// For re-use below.
	var sm StoreMsg

	// Walk all subjects that need attention here.
	for subj, info := range needAttention {
		total, start, stop := info.total, info.fblk, info.lblk

		for i := start; i <= stop; i++ {
			mb := fs.bim[i]
			if mb == nil {
				continue
			}
			// Grab the ss entry for this subject in case sparse.
			mb.mu.Lock()
			mb.ensurePerSubjectInfoLoaded()
			ss := mb.fss[subj]
			if ss != nil && ss.firstNeedsUpdate {
				mb.recalculateFirstForSubj(subj, ss.First, ss)
			}
			mb.mu.Unlock()
			if ss == nil {
				continue
			}
			// Remove matching messages from the front until at or below the cap.
			for seq := ss.First; seq <= ss.Last && total > maxMsgsPer; {
				m, _, err := mb.firstMatching(subj, false, seq, &sm)
				if err == nil {
					seq = m.seq + 1
					if removed, _ := fs.removeMsgViaLimits(m.seq); removed {
						total--
						blks[mb] = struct{}{}
					}
				} else {
					// On error just do single increment.
					seq++
				}
			}
		}
	}

	// Expire the cache if we can.
	for mb := range blks {
		mb.mu.Lock()
		if mb.msgs > 0 {
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()
	}
}
  3748  
// deleteFirstMsg removes the message at the current first sequence,
// treating it as a limits-driven removal (no tombstone written).
// Lock should be held.
func (fs *fileStore) deleteFirstMsg() (bool, error) {
	return fs.removeMsgViaLimits(fs.state.FirstSeq)
}
  3753  
// If we remove via limits that can always be recovered on a restart we
// do not force the system to update the index file.
// Lock should be held.
func (fs *fileStore) removeMsgViaLimits(seq uint64) (bool, error) {
	// secure=false, viaLimits=true, needFSLock=false (caller holds fs lock).
	return fs.removeMsg(seq, false, true, false)
}
  3760  
// RemoveMsg will remove the message from this store.
// Returns whether the message was removed and any error encountered.
func (fs *fileStore) RemoveMsg(seq uint64) (bool, error) {
	// secure=false, viaLimits=false, needFSLock=true.
	return fs.removeMsg(seq, false, false, true)
}
  3766  
// EraseMsg removes the message and securely erases the underlying record
// (overwrites it with random data) via removeMsg with secure=true.
func (fs *fileStore) EraseMsg(seq uint64) (bool, error) {
	return fs.removeMsg(seq, true, false, true)
}
  3770  
// Convenience function to remove per subject tracking at the filestore level.
// Decrements the running total for subj in psim; once the total reaches zero
// the entry (and its tracked subject length) is removed.
// Lock should be held.
func (fs *fileStore) removePerSubject(subj string) {
	if len(subj) == 0 || fs.psim == nil {
		return
	}
	// We do not update sense of fblk here but will do so when we resolve during lookup.
	bsubj := stringToBytes(subj)
	if info, ok := fs.psim.Find(bsubj); ok {
		// NOTE(review): assumes info.total > 0 whenever an entry exists; a zero
		// total would wrap on decrement — relies on psim invariants upstream.
		info.total--
		if info.total == 1 {
			// Only one message left for this subject; it must live in the last block.
			info.fblk = info.lblk
		} else if info.total == 0 {
			if _, ok = fs.psim.Delete(bsubj); ok {
				fs.tsl -= len(subj)
			}
		}
	}
}
  3790  
  3791  // Remove a message, optionally rewriting the mb file.
  3792  func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (bool, error) {
  3793  	if seq == 0 {
  3794  		return false, ErrStoreMsgNotFound
  3795  	}
  3796  	fsLock := func() {
  3797  		if needFSLock {
  3798  			fs.mu.Lock()
  3799  		}
  3800  	}
  3801  	fsUnlock := func() {
  3802  		if needFSLock {
  3803  			fs.mu.Unlock()
  3804  		}
  3805  	}
  3806  
  3807  	fsLock()
  3808  
  3809  	if fs.closed {
  3810  		fsUnlock()
  3811  		return false, ErrStoreClosed
  3812  	}
  3813  	if !viaLimits && fs.sips > 0 {
  3814  		fsUnlock()
  3815  		return false, ErrStoreSnapshotInProgress
  3816  	}
  3817  	// If in encrypted mode negate secure rewrite here.
  3818  	if secure && fs.prf != nil {
  3819  		secure = false
  3820  	}
  3821  
  3822  	mb := fs.selectMsgBlock(seq)
  3823  	if mb == nil {
  3824  		var err = ErrStoreEOF
  3825  		if seq <= fs.state.LastSeq {
  3826  			err = ErrStoreMsgNotFound
  3827  		}
  3828  		fsUnlock()
  3829  		return false, err
  3830  	}
  3831  
  3832  	mb.mu.Lock()
  3833  
  3834  	// See if we are closed or the sequence number is still relevant or if we know its deleted.
  3835  	if mb.closed || seq < atomic.LoadUint64(&mb.first.seq) || mb.dmap.Exists(seq) {
  3836  		mb.mu.Unlock()
  3837  		fsUnlock()
  3838  		return false, nil
  3839  	}
  3840  
  3841  	// We used to not have to load in the messages except with callbacks or the filtered subject state (which is now always on).
  3842  	// Now just load regardless.
  3843  	// TODO(dlc) - Figure out a way not to have to load it in, we need subject tracking outside main data block.
  3844  	if mb.cacheNotLoaded() {
  3845  		if err := mb.loadMsgsWithLock(); err != nil {
  3846  			mb.mu.Unlock()
  3847  			fsUnlock()
  3848  			return false, err
  3849  		}
  3850  	}
  3851  
  3852  	var smv StoreMsg
  3853  	sm, err := mb.cacheLookup(seq, &smv)
  3854  	if err != nil {
  3855  		mb.mu.Unlock()
  3856  		fsUnlock()
  3857  		// Mimic err behavior from above check to dmap. No error returned if already removed.
  3858  		if err == errDeletedMsg {
  3859  			err = nil
  3860  		}
  3861  		return false, err
  3862  	}
  3863  	// Grab size
  3864  	msz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
  3865  
  3866  	// Set cache timestamp for last remove.
  3867  	mb.lrts = time.Now().UnixNano()
  3868  
  3869  	// Global stats
  3870  	if fs.state.Msgs > 0 {
  3871  		fs.state.Msgs--
  3872  	}
  3873  	if msz < fs.state.Bytes {
  3874  		fs.state.Bytes -= msz
  3875  	} else {
  3876  		fs.state.Bytes = 0
  3877  	}
  3878  
  3879  	// Now local mb updates.
  3880  	if mb.msgs > 0 {
  3881  		mb.msgs--
  3882  	}
  3883  	if msz < mb.bytes {
  3884  		mb.bytes -= msz
  3885  	} else {
  3886  		mb.bytes = 0
  3887  	}
  3888  
  3889  	// Mark as dirty for stream state.
  3890  	fs.dirty++
  3891  
  3892  	// If we are tracking subjects here make sure we update that accounting.
  3893  	mb.ensurePerSubjectInfoLoaded()
  3894  
  3895  	// If we are tracking multiple subjects here make sure we update that accounting.
  3896  	mb.removeSeqPerSubject(sm.subj, seq)
  3897  	fs.removePerSubject(sm.subj)
  3898  
  3899  	if secure {
  3900  		// Grab record info.
  3901  		ri, rl, _, _ := mb.slotInfo(int(seq - mb.cache.fseq))
  3902  		if err := mb.eraseMsg(seq, int(ri), int(rl)); err != nil {
  3903  			return false, err
  3904  		}
  3905  	}
  3906  
  3907  	fifo := seq == atomic.LoadUint64(&mb.first.seq)
  3908  	isLastBlock := mb == fs.lmb
  3909  	isEmpty := mb.msgs == 0
  3910  
  3911  	if fifo {
  3912  		mb.selectNextFirst()
  3913  		if !isEmpty {
  3914  			// Can update this one in place.
  3915  			if seq == fs.state.FirstSeq {
  3916  				fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) // new one.
  3917  				if mb.first.ts == 0 {
  3918  					fs.state.FirstTime = time.Time{}
  3919  				} else {
  3920  					fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
  3921  				}
  3922  			}
  3923  		}
  3924  	} else if !isEmpty {
  3925  		// Out of order delete.
  3926  		mb.dmap.Insert(seq)
  3927  		// Make simple check here similar to Compact(). If we can save 50% and over a certain threshold do inline.
  3928  		// All other more thorough cleanup will happen in syncBlocks logic.
  3929  		// Note that we do not have to store empty records for the deleted, so don't use to calculate.
  3930  		// TODO(dlc) - This should not be inline, should kick the sync routine.
  3931  		if mb.rbytes > compactMinimum && mb.bytes*2 < mb.rbytes && !isLastBlock {
  3932  			mb.compact()
  3933  		}
  3934  	}
  3935  
  3936  	if secure {
  3937  		if ld, _ := mb.flushPendingMsgsLocked(); ld != nil {
  3938  			// We have the mb lock here, this needs the mb locks so do in its own go routine.
  3939  			go fs.rebuildState(ld)
  3940  		}
  3941  	}
  3942  
  3943  	// If empty remove this block and check if we need to update first sequence.
  3944  	// We will write a tombstone at the end.
  3945  	var firstSeqNeedsUpdate bool
  3946  	if isEmpty {
  3947  		// This writes tombstone iff mb == lmb, so no need to do below.
  3948  		fs.removeMsgBlock(mb)
  3949  		firstSeqNeedsUpdate = seq == fs.state.FirstSeq
  3950  	}
  3951  	mb.mu.Unlock()
  3952  
  3953  	// If we emptied the current message block and the seq was state.FirstSeq
  3954  	// then we need to jump message blocks. We will also write the index so
  3955  	// we don't lose track of the first sequence.
  3956  	if firstSeqNeedsUpdate {
  3957  		fs.selectNextFirst()
  3958  	}
  3959  
  3960  	// Check if we need to write a deleted record tombstone.
  3961  	// This is for user initiated removes or to hold the first seq
  3962  	// when the last block is empty.
  3963  
  3964  	// If not via limits and not empty and last (empty writes tombstone above if last) write tombstone.
  3965  	if !viaLimits && !(isEmpty && isLastBlock) {
  3966  		if lmb := fs.lmb; sm != nil && lmb != nil {
  3967  			lmb.writeTombstone(sm.seq, sm.ts)
  3968  		}
  3969  	}
  3970  
  3971  	if cb := fs.scb; cb != nil {
  3972  		// If we have a callback registered we need to release lock regardless since cb might need it to lookup msg, etc.
  3973  		fs.mu.Unlock()
  3974  		// Storage updates.
  3975  		var subj string
  3976  		if sm != nil {
  3977  			subj = sm.subj
  3978  		}
  3979  		delta := int64(msz)
  3980  		cb(-1, -delta, seq, subj)
  3981  
  3982  		if !needFSLock {
  3983  			fs.mu.Lock()
  3984  		}
  3985  	} else if needFSLock {
  3986  		// We acquired it so release it.
  3987  		fs.mu.Unlock()
  3988  	}
  3989  
  3990  	return true, nil
  3991  }
  3992  
// This will compact and rewrite this block. This should only be called when we know we want to rewrite this block.
// This should not be called on the lmb since we will prune tail deleted messages which could cause issues with
// writing new messages. We will silently bail on any issues with the underlying block and let someone else detect.
// Write lock needs to be held.
func (mb *msgBlock) compact() {
	wasLoaded := mb.cacheAlreadyLoaded()
	if !wasLoaded {
		if err := mb.loadMsgsWithLock(); err != nil {
			return
		}
	}

	buf := mb.cache.buf
	nbuf := getMsgBlockBuf(len(buf))
	// Recycle our nbuf when we are done.
	defer recycleMsgBlockBuf(nbuf)

	var le = binary.LittleEndian
	var firstSet bool

	fseq := atomic.LoadUint64(&mb.first.seq)
	// A record is deleted if its seq is zero, carries the erase bit, lives in
	// the delete map, or falls below this block's first sequence.
	isDeleted := func(seq uint64) bool {
		return seq == 0 || seq&ebit != 0 || mb.dmap.Exists(seq) || seq < fseq
	}

	// Walk the raw buffer record by record, copying only live records into nbuf.
	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
		if index+msgHdrSize > lbuf {
			return
		}
		hdr := buf[index : index+msgHdrSize]
		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])
		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize
		// Do some quick sanity checks here.
		if dlen < 0 || int(slen) > dlen || dlen > int(rl) || rl > rlBadThresh || index+rl > lbuf {
			return
		}
		// Only need to process non-deleted messages.
		seq := le.Uint64(hdr[4:])

		if !isDeleted(seq) {
			// Check for tombstones.
			if seq&tbit != 0 {
				// If we are last mb we should consider to keep these unless the tombstone reflects a seq in this mb.
				if mb == mb.fs.lmb && seq < fseq {
					nbuf = append(nbuf, buf[index:index+rl]...)
				}
			} else {
				// Normal message here.
				nbuf = append(nbuf, buf[index:index+rl]...)
				if !firstSet {
					firstSet = true
					atomic.StoreUint64(&mb.first.seq, seq)
				}
			}
		}
		// Advance to next record.
		index += rl
	}

	// Handle compression
	if mb.cmp != NoCompression {
		cbuf, err := mb.cmp.Compress(nbuf)
		if err != nil {
			return
		}
		meta := &CompressionInfo{
			Algorithm:    mb.cmp,
			OriginalSize: uint64(len(nbuf)),
		}
		nbuf = append(meta.MarshalMetadata(), cbuf...)
	}

	// Check for encryption.
	if mb.bek != nil && len(nbuf) > 0 {
		// Recreate to reset counter.
		rbek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
		if err != nil {
			return
		}
		rbek.XORKeyStream(nbuf, nbuf)
	}

	// Close FDs first.
	mb.closeFDsLocked()

	// We will write to a new file and mv/rename it in case of failure.
	mfn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(newScan, mb.index))
	<-dios
	err := os.WriteFile(mfn, nbuf, defaultFilePerms)
	dios <- struct{}{}
	if err != nil {
		os.Remove(mfn)
		return
	}
	if err := os.Rename(mfn, mb.mfn); err != nil {
		os.Remove(mfn)
		return
	}

	// Capture the updated rbytes.
	mb.rbytes = uint64(len(nbuf))

	// Remove any seqs from the beginning of the blk.
	for seq, nfseq := fseq, atomic.LoadUint64(&mb.first.seq); seq < nfseq; seq++ {
		mb.dmap.Delete(seq)
	}
	// Make sure we clear the cache since no longer valid.
	mb.clearCacheAndOffset()
	// If we entered with the msgs loaded make sure to reload them.
	if wasLoaded {
		mb.loadMsgsWithLock()
	}
}
  4108  
// Grab info from a slot.
// Returns the record's byte offset into the cache buffer, its record length,
// whether the record's hash has already been checked, and an error.
// Lock should be held.
func (mb *msgBlock) slotInfo(slot int) (uint32, uint32, bool, error) {
	if mb.cache == nil || slot >= len(mb.cache.idx) {
		return 0, 0, false, errPartialCache
	}

	bi := mb.cache.idx[slot]
	ri, hashChecked := (bi &^ hbit), (bi&hbit) != 0

	// If this is a deleted slot return here.
	if bi == dbit {
		return 0, 0, false, errDeletedMsg
	}

	// Determine record length
	var rl uint32
	// NOTE(review): this branch is unreachable — slot >= len(mb.cache.idx)
	// already returned errPartialCache above. Left as-is to preserve behavior.
	if slot >= len(mb.cache.idx) {
		rl = mb.cache.lrl
	} else {
		// Need to account for dbit markers in idx.
		// So we will walk until we find valid idx slot to calculate rl.
		for i := 1; slot+i < len(mb.cache.idx); i++ {
			ni := mb.cache.idx[slot+i] &^ hbit
			if ni == dbit {
				continue
			}
			rl = ni - ri
			break
		}
		// check if we had all trailing dbits.
		// If so use len of cache buf minus ri.
		if rl == 0 {
			rl = uint32(len(mb.cache.buf)) - ri
		}
	}
	if rl < msgHdrSize {
		return 0, 0, false, errBadMsg
	}
	return uint32(ri), rl, hashChecked, nil
}
  4150  
  4151  func (fs *fileStore) isClosed() bool {
  4152  	fs.mu.RLock()
  4153  	closed := fs.closed
  4154  	fs.mu.RUnlock()
  4155  	return closed
  4156  }
  4157  
  4158  // Will spin up our flush loop.
  4159  func (mb *msgBlock) spinUpFlushLoop() {
  4160  	mb.mu.Lock()
  4161  	defer mb.mu.Unlock()
  4162  
  4163  	// Are we already running or closed?
  4164  	if mb.flusher || mb.closed {
  4165  		return
  4166  	}
  4167  	mb.flusher = true
  4168  	mb.fch = make(chan struct{}, 1)
  4169  	mb.qch = make(chan struct{})
  4170  	fch, qch := mb.fch, mb.qch
  4171  
  4172  	go mb.flushLoop(fch, qch)
  4173  }
  4174  
  4175  // Raw low level kicker for flush loops.
  4176  func kickFlusher(fch chan struct{}) {
  4177  	if fch != nil {
  4178  		select {
  4179  		case fch <- struct{}{}:
  4180  		default:
  4181  		}
  4182  	}
  4183  }
  4184  
  4185  // Kick flusher for this message block.
  4186  func (mb *msgBlock) kickFlusher() {
  4187  	mb.mu.RLock()
  4188  	defer mb.mu.RUnlock()
  4189  	kickFlusher(mb.fch)
  4190  }
  4191  
  4192  func (mb *msgBlock) setInFlusher() {
  4193  	mb.mu.Lock()
  4194  	mb.flusher = true
  4195  	mb.mu.Unlock()
  4196  }
  4197  
  4198  func (mb *msgBlock) clearInFlusher() {
  4199  	mb.mu.Lock()
  4200  	mb.flusher = false
  4201  	mb.mu.Unlock()
  4202  }
  4203  
// flushLoop watches for messages, index info, or recently closed msg block updates.
// It coalesces small pending writes with exponential backoff (capped by
// maxFlushWait) before flushing, and exits when signaled on qch or when this
// block is no longer the last (writable) block.
func (mb *msgBlock) flushLoop(fch, qch chan struct{}) {
	mb.setInFlusher()
	defer mb.clearInFlusher()

	for {
		select {
		case <-fch:
			// If we have pending messages process them first.
			if waiting := mb.pendingWriteSize(); waiting != 0 {
				ts := 1 * time.Millisecond
				var waited time.Duration

				// Wait for more data to coalesce, doubling the sleep each round,
				// but bail once we stop making progress or exceed maxFlushWait.
				for waiting < coalesceMinimum {
					time.Sleep(ts)
					select {
					case <-qch:
						return
					default:
					}
					newWaiting := mb.pendingWriteSize()
					if waited = waited + ts; waited > maxFlushWait || newWaiting <= waiting {
						break
					}
					waiting = newWaiting
					ts *= 2
				}
				mb.flushPendingMsgs()
				// Check if we are no longer the last message block. If we are
				// not we can close FDs and exit.
				mb.fs.mu.RLock()
				notLast := mb != mb.fs.lmb
				mb.fs.mu.RUnlock()
				if notLast {
					if err := mb.closeFDs(); err == nil {
						return
					}
				}
			}
		case <-qch:
			return
		}
	}
}
  4248  
// eraseMsg overwrites the record for seq at cache offset ri (record length rl)
// with an erase header (seq|ebit) and random payload bytes, updating both the
// in-memory cache and the on-disk file where the record has been written out.
// Lock should be held.
func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error {
	var le = binary.LittleEndian
	var hdr [msgHdrSize]byte

	// Header: record length, seq with erase bit, zeroed timestamp and subject length.
	le.PutUint32(hdr[0:], uint32(rl))
	le.PutUint64(hdr[4:], seq|ebit)
	le.PutUint64(hdr[12:], 0)
	le.PutUint16(hdr[20:], 0)

	// Randomize record
	data := make([]byte, rl-emptyRecordLen)
	if n, err := rand.Read(data); err != nil {
		return err
	} else if n != len(data) {
		return fmt.Errorf("not enough overwrite bytes read (%d != %d)", n, len(data))
	}

	// Now write to underlying buffer.
	var b bytes.Buffer
	b.Write(hdr[:])
	b.Write(data)

	// Calculate hash.
	mb.hh.Reset()
	mb.hh.Write(hdr[4:20])
	mb.hh.Write(data)
	checksum := mb.hh.Sum(nil)
	// Write to msg record.
	b.Write(checksum)

	// Update both cache and disk.
	nbytes := b.Bytes()

	// Cache: only if the record lies within the currently cached window.
	if ri >= mb.cache.off {
		li := ri - mb.cache.off
		buf := mb.cache.buf[li : li+rl]
		copy(buf, nbytes)
	}

	// Disk: only if the record has already been flushed past the write pointer.
	if mb.cache.off+mb.cache.wp > ri {
		<-dios
		mfd, err := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
		dios <- struct{}{}
		if err != nil {
			return err
		}
		defer mfd.Close()
		if _, err = mfd.WriteAt(nbytes, int64(ri)); err == nil {
			mfd.Sync()
		}
		if err != nil {
			return err
		}
	}
	return nil
}
  4308  
  4309  // Truncate this message block to the storedMsg.
  4310  func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) {
  4311  	mb.mu.Lock()
  4312  	defer mb.mu.Unlock()
  4313  
  4314  	// Make sure we are loaded to process messages etc.
  4315  	if err := mb.loadMsgsWithLock(); err != nil {
  4316  		return 0, 0, err
  4317  	}
  4318  
  4319  	// Calculate new eof using slot info from our new last sm.
  4320  	ri, rl, _, err := mb.slotInfo(int(sm.seq - mb.cache.fseq))
  4321  	if err != nil {
  4322  		return 0, 0, err
  4323  	}
  4324  	// Calculate new eof.
  4325  	eof := int64(ri + rl)
  4326  
  4327  	var purged, bytes uint64
  4328  
  4329  	checkDmap := mb.dmap.Size() > 0
  4330  	var smv StoreMsg
  4331  
  4332  	for seq := atomic.LoadUint64(&mb.last.seq); seq > sm.seq; seq-- {
  4333  		if checkDmap {
  4334  			if mb.dmap.Exists(seq) {
  4335  				// Delete and skip to next.
  4336  				mb.dmap.Delete(seq)
  4337  				checkDmap = !mb.dmap.IsEmpty()
  4338  				continue
  4339  			}
  4340  		}
  4341  		// We should have a valid msg to calculate removal stats.
  4342  		if m, err := mb.cacheLookup(seq, &smv); err == nil {
  4343  			if mb.msgs > 0 {
  4344  				rl := fileStoreMsgSize(m.subj, m.hdr, m.msg)
  4345  				mb.msgs--
  4346  				if rl > mb.bytes {
  4347  					rl = mb.bytes
  4348  				}
  4349  				mb.bytes -= rl
  4350  				mb.rbytes -= rl
  4351  				// For return accounting.
  4352  				purged++
  4353  				bytes += uint64(rl)
  4354  			}
  4355  		}
  4356  	}
  4357  
  4358  	// If the block is compressed then we have to load it into memory
  4359  	// and decompress it, truncate it and then write it back out.
  4360  	// Otherwise, truncate the file itself and close the descriptor.
  4361  	if mb.cmp != NoCompression {
  4362  		buf, err := mb.loadBlock(nil)
  4363  		if err != nil {
  4364  			return 0, 0, fmt.Errorf("failed to load block from disk: %w", err)
  4365  		}
  4366  		if mb.bek != nil && len(buf) > 0 {
  4367  			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  4368  			if err != nil {
  4369  				return 0, 0, err
  4370  			}
  4371  			mb.bek = bek
  4372  			mb.bek.XORKeyStream(buf, buf)
  4373  		}
  4374  		buf, err = mb.decompressIfNeeded(buf)
  4375  		if err != nil {
  4376  			return 0, 0, fmt.Errorf("failed to decompress block: %w", err)
  4377  		}
  4378  		buf = buf[:eof]
  4379  		copy(mb.lchk[0:], buf[:len(buf)-checksumSize])
  4380  		buf, err = mb.cmp.Compress(buf)
  4381  		if err != nil {
  4382  			return 0, 0, fmt.Errorf("failed to recompress block: %w", err)
  4383  		}
  4384  		meta := &CompressionInfo{
  4385  			Algorithm:    mb.cmp,
  4386  			OriginalSize: uint64(eof),
  4387  		}
  4388  		buf = append(meta.MarshalMetadata(), buf...)
  4389  		if mb.bek != nil && len(buf) > 0 {
  4390  			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  4391  			if err != nil {
  4392  				return 0, 0, err
  4393  			}
  4394  			mb.bek = bek
  4395  			mb.bek.XORKeyStream(buf, buf)
  4396  		}
  4397  		n, err := mb.writeAt(buf, 0)
  4398  		if err != nil {
  4399  			return 0, 0, fmt.Errorf("failed to rewrite compressed block: %w", err)
  4400  		}
  4401  		if n != len(buf) {
  4402  			return 0, 0, fmt.Errorf("short write (%d != %d)", n, len(buf))
  4403  		}
  4404  		mb.mfd.Truncate(int64(len(buf)))
  4405  		mb.mfd.Sync()
  4406  	} else if mb.mfd != nil {
  4407  		mb.mfd.Truncate(eof)
  4408  		mb.mfd.Sync()
  4409  		// Update our checksum.
  4410  		var lchk [8]byte
  4411  		mb.mfd.ReadAt(lchk[:], eof-8)
  4412  		copy(mb.lchk[0:], lchk[:])
  4413  	} else {
  4414  		return 0, 0, fmt.Errorf("failed to truncate msg block %d, file not open", mb.index)
  4415  	}
  4416  
  4417  	// Update our last msg.
  4418  	atomic.StoreUint64(&mb.last.seq, sm.seq)
  4419  	mb.last.ts = sm.ts
  4420  
  4421  	// Clear our cache.
  4422  	mb.clearCacheAndOffset()
  4423  
  4424  	// Redo per subject info for this block.
  4425  	mb.resetPerSubjectInfo()
  4426  
  4427  	// Load msgs again.
  4428  	mb.loadMsgsWithLock()
  4429  
  4430  	return purged, bytes, nil
  4431  }
  4432  
  4433  // Helper to determine if the mb is empty.
  4434  func (mb *msgBlock) isEmpty() bool {
  4435  	return atomic.LoadUint64(&mb.first.seq) > atomic.LoadUint64(&mb.last.seq)
  4436  }
  4437  
// selectNextFirst advances mb.first past any deleted sequences (pruning their
// dmap entries) and refreshes the first timestamp. May briefly release and
// reacquire mb.mu when it has to fall back to fetchMsg for the timestamp.
// Lock should be held.
func (mb *msgBlock) selectNextFirst() {
	var seq uint64
	fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
	for seq = fseq + 1; seq <= lseq; seq++ {
		if mb.dmap.Exists(seq) {
			// We will move past this so we can delete the entry.
			mb.dmap.Delete(seq)
		} else {
			break
		}
	}
	// Set new first sequence.
	atomic.StoreUint64(&mb.first.seq, seq)

	// Check if we are empty..
	if seq > lseq {
		mb.first.ts = 0
		return
	}

	// Need to get the timestamp.
	// We will try the cache direct and fallback if needed.
	var smv StoreMsg
	sm, _ := mb.cacheLookup(seq, &smv)
	if sm == nil {
		// Slow path, need to unlock.
		mb.mu.Unlock()
		sm, _, _ = mb.fetchMsg(seq, &smv)
		mb.mu.Lock()
	}
	if sm != nil {
		mb.first.ts = sm.ts
	} else {
		mb.first.ts = 0
	}
}
  4475  
  4476  // Select the next FirstSeq
  4477  // Lock should be held.
  4478  func (fs *fileStore) selectNextFirst() {
  4479  	if len(fs.blks) > 0 {
  4480  		mb := fs.blks[0]
  4481  		mb.mu.RLock()
  4482  		fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq)
  4483  		fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
  4484  		mb.mu.RUnlock()
  4485  	} else {
  4486  		// Could not find anything, so treat like purge
  4487  		fs.state.FirstSeq = fs.state.LastSeq + 1
  4488  		fs.state.FirstTime = time.Time{}
  4489  	}
  4490  }
  4491  
  4492  // Lock should be held.
  4493  func (mb *msgBlock) resetCacheExpireTimer(td time.Duration) {
  4494  	if td == 0 {
  4495  		td = mb.cexp + 100*time.Millisecond
  4496  	}
  4497  	if mb.ctmr == nil {
  4498  		mb.ctmr = time.AfterFunc(td, mb.expireCache)
  4499  	} else {
  4500  		mb.ctmr.Reset(td)
  4501  	}
  4502  }
  4503  
// startCacheExpireTimer arms the cache expire timer with the default delay.
// Lock should be held.
func (mb *msgBlock) startCacheExpireTimer() {
	mb.resetCacheExpireTimer(0)
}
  4508  
  4509  // Used when we load in a message block.
  4510  // Lock should be held.
  4511  func (mb *msgBlock) clearCacheAndOffset() {
  4512  	// Reset linear scan tracker.
  4513  	mb.llseq = 0
  4514  	if mb.cache != nil {
  4515  		mb.cache.off = 0
  4516  		mb.cache.wp = 0
  4517  	}
  4518  	mb.clearCache()
  4519  }
  4520  
  4521  // Lock should be held.
  4522  func (mb *msgBlock) clearCache() {
  4523  	if mb.ctmr != nil && mb.fss == nil {
  4524  		mb.ctmr.Stop()
  4525  		mb.ctmr = nil
  4526  	}
  4527  
  4528  	if mb.cache == nil {
  4529  		return
  4530  	}
  4531  
  4532  	buf := mb.cache.buf
  4533  	if mb.cache.off == 0 {
  4534  		mb.cache = nil
  4535  	} else {
  4536  		// Clear msgs and index.
  4537  		mb.cache.buf = nil
  4538  		mb.cache.idx = nil
  4539  		mb.cache.wp = 0
  4540  	}
  4541  	recycleMsgBlockBuf(buf)
  4542  }
  4543  
  4544  // Called to possibly expire a message block cache.
  4545  func (mb *msgBlock) expireCache() {
  4546  	mb.mu.Lock()
  4547  	defer mb.mu.Unlock()
  4548  	mb.expireCacheLocked()
  4549  }
  4550  
  4551  func (mb *msgBlock) tryForceExpireCache() {
  4552  	mb.mu.Lock()
  4553  	defer mb.mu.Unlock()
  4554  	mb.tryForceExpireCacheLocked()
  4555  }
  4556  
  4557  // We will attempt to force expire this by temporarily clearing the last load time.
  4558  func (mb *msgBlock) tryForceExpireCacheLocked() {
  4559  	llts := mb.llts
  4560  	mb.llts = 0
  4561  	mb.expireCacheLocked()
  4562  	mb.llts = llts
  4563  }
  4564  
// This is for expiration of the write cache, which will be partial with fip.
// So we want to bypass the Pools here.
// Returns the old cache buffer (reset to zero length) for reuse when it is
// safe to do so, otherwise nil.
// Lock should be held.
func (mb *msgBlock) tryExpireWriteCache() []byte {
	if mb.cache == nil {
		return nil
	}
	// Save state to restore after the expire attempt; nra=true prevents the
	// buffer from being recycled into the pool by expireCacheLocked.
	lwts, buf, llts, nra := mb.lwts, mb.cache.buf, mb.llts, mb.cache.nra
	mb.lwts, mb.cache.nra = 0, true
	mb.expireCacheLocked()
	mb.lwts = lwts
	if mb.cache != nil {
		mb.cache.nra = nra
	}
	// We could check for a certain time since last load, but to be safe just reuse if no loads at all.
	if llts == 0 && (mb.cache == nil || mb.cache.buf == nil) {
		// Clear last write time since we now are about to move on to a new lmb.
		mb.lwts = 0
		return buf[:0]
	}
	return nil
}
  4587  
// expireCacheLocked releases the cached message buffer when there has been no
// read/write activity within the expiry window, rescheduling the timer
// otherwise. Pending (unflushed) bytes always prevent expiration.
// Lock should be held.
func (mb *msgBlock) expireCacheLocked() {
	if mb.cache == nil {
		if mb.ctmr != nil {
			mb.ctmr.Stop()
			mb.ctmr = nil
		}
		return
	}

	// Can't expire if we still have pending.
	if mb.cache != nil && len(mb.cache.buf)-int(mb.cache.wp) > 0 {
		mb.resetCacheExpireTimer(mb.cexp)
		return
	}

	// Grab timestamp to compare.
	tns := time.Now().UnixNano()

	// For the core buffer of messages, we care about reads and writes, but not removes.
	bufts := mb.llts
	if mb.lwts > bufts {
		bufts = mb.lwts
	}

	// Check for activity on the cache that would prevent us from expiring.
	if tns-bufts <= int64(mb.cexp) {
		mb.resetCacheExpireTimer(mb.cexp - time.Duration(tns-bufts))
		return
	}

	// If we are here we will at least expire the core msg buffer.
	// We need to capture offset in case we do a write next before a full load.
	if mb.cache != nil {
		mb.cache.off += len(mb.cache.buf)
		if !mb.cache.nra {
			recycleMsgBlockBuf(mb.cache.buf)
		}
		mb.cache.buf = nil
		mb.cache.wp = 0
	}

	// Check if we can clear out our idx unless under force expire.
	// fss we keep longer and expire under sync timer checks.
	mb.clearCache()
}
  4634  
  4635  func (fs *fileStore) startAgeChk() {
  4636  	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
  4637  		fs.ageChk = time.AfterFunc(fs.cfg.MaxAge, fs.expireMsgs)
  4638  	}
  4639  }
  4640  
  4641  // Lock should be held.
  4642  func (fs *fileStore) resetAgeChk(delta int64) {
  4643  	if fs.cfg.MaxAge == 0 {
  4644  		return
  4645  	}
  4646  
  4647  	fireIn := fs.cfg.MaxAge
  4648  	if delta > 0 && time.Duration(delta) < fireIn {
  4649  		if fireIn = time.Duration(delta); fireIn < 250*time.Millisecond {
  4650  			// Only fire at most once every 250ms.
  4651  			// Excessive firing can effect ingest performance.
  4652  			fireIn = time.Second
  4653  		}
  4654  	}
  4655  	if fs.ageChk != nil {
  4656  		fs.ageChk.Reset(fireIn)
  4657  	} else {
  4658  		fs.ageChk = time.AfterFunc(fireIn, fs.expireMsgs)
  4659  	}
  4660  }
  4661  
  4662  // Lock should be held.
  4663  func (fs *fileStore) cancelAgeChk() {
  4664  	if fs.ageChk != nil {
  4665  		fs.ageChk.Stop()
  4666  		fs.ageChk = nil
  4667  	}
  4668  }
  4669  
// Will expire msgs that are too old.
func (fs *fileStore) expireMsgs() {
	// We need to delete one by one here and can not optimize for the time being.
	// Reason is that we need more information to adjust ack pending in consumers.
	var smv StoreMsg
	var sm *StoreMsg
	fs.mu.RLock()
	maxAge := int64(fs.cfg.MaxAge)
	minAge := time.Now().UnixNano() - maxAge
	fs.mu.RUnlock()

	// Remove from the front while the first message is older than the cutoff.
	for sm, _ = fs.msgForSeq(0, &smv); sm != nil && sm.ts <= minAge; sm, _ = fs.msgForSeq(0, &smv) {
		fs.mu.Lock()
		fs.removeMsgViaLimits(sm.seq)
		fs.mu.Unlock()
		// Recalculate in case we are expiring a bunch.
		minAge = time.Now().UnixNano() - maxAge
	}

	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Only cancel if no message left, not on potential lookup error that would result in sm == nil.
	if fs.state.Msgs == 0 {
		fs.cancelAgeChk()
	} else {
		if sm == nil {
			fs.resetAgeChk(0)
		} else {
			fs.resetAgeChk(sm.ts - minAge)
		}
	}
}
  4703  
  4704  // Lock should be held.
  4705  func (fs *fileStore) checkAndFlushAllBlocks() {
  4706  	for _, mb := range fs.blks {
  4707  		if mb.pendingWriteSize() > 0 {
  4708  			// Since fs lock is held need to pull this apart in case we need to rebuild state.
  4709  			mb.mu.Lock()
  4710  			ld, _ := mb.flushPendingMsgsLocked()
  4711  			mb.mu.Unlock()
  4712  			if ld != nil {
  4713  				fs.rebuildStateLocked(ld)
  4714  			}
  4715  		}
  4716  	}
  4717  }
  4718  
// This will check all the checksums on messages and report back any sequence numbers with errors.
// Flushes all pending writes first, then rebuilds each block's state from disk
// and repopulates the global per-subject info. Returns the accumulated
// lost-data record, if any.
func (fs *fileStore) checkMsgs() *LostStreamData {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	fs.checkAndFlushAllBlocks()

	// Clear any global subject state.
	fs.psim, fs.tsl = fs.psim.Empty(), 0

	for _, mb := range fs.blks {
		// Make sure encryption loaded if needed for the block.
		fs.loadEncryptionForMsgBlock(mb)
		// FIXME(dlc) - check tombstones here too?
		if ld, _, err := mb.rebuildState(); err != nil && ld != nil {
			// Rebuild fs state too.
			fs.rebuildStateLocked(ld)
		}
		fs.populateGlobalPerSubjectInfo(mb)
	}

	return fs.ld
}
  4742  
  4743  // Lock should be held.
  4744  func (mb *msgBlock) enableForWriting(fip bool) error {
  4745  	if mb == nil {
  4746  		return errNoMsgBlk
  4747  	}
  4748  	if mb.mfd != nil {
  4749  		return nil
  4750  	}
  4751  	<-dios
  4752  	mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
  4753  	dios <- struct{}{}
  4754  	if err != nil {
  4755  		return fmt.Errorf("error opening msg block file [%q]: %v", mb.mfn, err)
  4756  	}
  4757  	mb.mfd = mfd
  4758  
  4759  	// Spin up our flusher loop if needed.
  4760  	if !fip {
  4761  		mb.spinUpFlushLoop()
  4762  	}
  4763  
  4764  	return nil
  4765  }
  4766  
  4767  // Helper function to place a delete tombstone.
  4768  func (mb *msgBlock) writeTombstone(seq uint64, ts int64) error {
  4769  	return mb.writeMsgRecord(emptyRecordLen, seq|tbit, _EMPTY_, nil, nil, ts, true)
  4770  }
  4771  
// Will write the message record to the underlying message block.
// filestore lock will be held.
// Appends the record to the write-through cache, updates per-subject state and
// accounting, and either flushes inline or kicks the background flusher.
func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush bool) error {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	// Enable for writing if our mfd is not open.
	if mb.mfd == nil {
		if err := mb.enableForWriting(flush); err != nil {
			return err
		}
	}

	// Make sure we have a cache setup.
	if mb.cache == nil {
		mb.setupWriteCache(nil)
	}

	// Check if we are tracking per subject for our simple state.
	// Do this before changing the cache that would trigger a flush pending msgs call
	// if we needed to regenerate the per subject info.
	// Note that tombstones have no subject so will not trigger here.
	if len(subj) > 0 && !mb.noTrack {
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			return err
		}
		if ss := mb.fss[subj]; ss != nil {
			ss.Msgs++
			ss.Last = seq
		} else {
			mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
		}
	}

	// Indexing
	// Record offset of this record relative to the start of the block file.
	index := len(mb.cache.buf) + int(mb.cache.off)

	// Formats
	// Format with no header
	// total_len(4) sequence(8) timestamp(8) subj_len(2) subj msg hash(8)
	// With headers, high bit on total length will be set.
	// total_len(4) sequence(8) timestamp(8) subj_len(2) subj hdr_len(4) hdr msg hash(8)

	// First write header, etc.
	var le = binary.LittleEndian
	var hdr [msgHdrSize]byte

	l := uint32(rl)
	hasHeaders := len(mhdr) > 0
	if hasHeaders {
		l |= hbit
	}

	le.PutUint32(hdr[0:], l)
	le.PutUint64(hdr[4:], seq)
	le.PutUint64(hdr[12:], uint64(ts))
	le.PutUint16(hdr[20:], uint16(len(subj)))

	// Now write to underlying buffer.
	mb.cache.buf = append(mb.cache.buf, hdr[:]...)
	mb.cache.buf = append(mb.cache.buf, subj...)

	if hasHeaders {
		var hlen [4]byte
		le.PutUint32(hlen[0:], uint32(len(mhdr)))
		mb.cache.buf = append(mb.cache.buf, hlen[:]...)
		mb.cache.buf = append(mb.cache.buf, mhdr...)
	}
	mb.cache.buf = append(mb.cache.buf, msg...)

	// Calculate hash.
	// Note: hash covers seq+ts+subj_len (hdr[4:20]), subject, headers and msg,
	// but not the total_len word.
	mb.hh.Reset()
	mb.hh.Write(hdr[4:20])
	mb.hh.Write([]byte(subj))
	if hasHeaders {
		mb.hh.Write(mhdr)
	}
	mb.hh.Write(msg)
	checksum := mb.hh.Sum(nil)
	// Grab last checksum
	copy(mb.lchk[0:], checksum)

	// Update write through cache.
	// Write to msg record.
	mb.cache.buf = append(mb.cache.buf, checksum...)
	mb.cache.lrl = uint32(rl)

	// Set cache timestamp for last store.
	mb.lwts = ts

	// Only update index and do accounting if not a delete tombstone.
	if seq&tbit == 0 {
		// Accounting, do this before stripping ebit, it is ebit aware.
		mb.updateAccounting(seq, ts, rl)
		// Strip ebit if set.
		seq = seq &^ ebit
		if mb.cache.fseq == 0 {
			mb.cache.fseq = seq
		}
		// Write index
		mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit)
	}

	fch, werr := mb.fch, mb.werr

	// If we should be flushing, or had a write error, do so here.
	if flush || werr != nil {
		ld, err := mb.flushPendingMsgsLocked()
		if ld != nil && mb.fs != nil {
			// We have the mb lock here, this needs the mb locks so do in its own go routine.
			go mb.fs.rebuildState(ld)
		}
		if err != nil {
			return err
		}
	} else {
		// Kick the flusher here.
		kickFlusher(fch)
	}

	return nil
}
  4894  
  4895  // How many bytes pending to be written for this message block.
  4896  func (mb *msgBlock) pendingWriteSize() int {
  4897  	if mb == nil {
  4898  		return 0
  4899  	}
  4900  	mb.mu.RLock()
  4901  	defer mb.mu.RUnlock()
  4902  	return mb.pendingWriteSizeLocked()
  4903  }
  4904  
  4905  // How many bytes pending to be written for this message block.
  4906  func (mb *msgBlock) pendingWriteSizeLocked() int {
  4907  	if mb == nil {
  4908  		return 0
  4909  	}
  4910  	var pending int
  4911  	if !mb.closed && mb.mfd != nil && mb.cache != nil {
  4912  		pending = len(mb.cache.buf) - int(mb.cache.wp)
  4913  	}
  4914  	return pending
  4915  }
  4916  
  4917  // Try to close our FDs if we can.
  4918  func (mb *msgBlock) closeFDs() error {
  4919  	mb.mu.Lock()
  4920  	defer mb.mu.Unlock()
  4921  	return mb.closeFDsLocked()
  4922  }
  4923  
  4924  func (mb *msgBlock) closeFDsLocked() error {
  4925  	if buf, _ := mb.bytesPending(); len(buf) > 0 {
  4926  		return errPendingData
  4927  	}
  4928  	mb.closeFDsLockedNoCheck()
  4929  	return nil
  4930  }
  4931  
  4932  func (mb *msgBlock) closeFDsLockedNoCheck() {
  4933  	if mb.mfd != nil {
  4934  		mb.mfd.Close()
  4935  		mb.mfd = nil
  4936  	}
  4937  }
  4938  
  4939  // bytesPending returns the buffer to be used for writing to the underlying file.
  4940  // This marks we are in flush and will return nil if asked again until cleared.
  4941  // Lock should be held.
  4942  func (mb *msgBlock) bytesPending() ([]byte, error) {
  4943  	if mb == nil || mb.mfd == nil {
  4944  		return nil, errNoPending
  4945  	}
  4946  	if mb.cache == nil {
  4947  		return nil, errNoCache
  4948  	}
  4949  	if len(mb.cache.buf) <= mb.cache.wp {
  4950  		return nil, errNoPending
  4951  	}
  4952  	buf := mb.cache.buf[mb.cache.wp:]
  4953  	if len(buf) == 0 {
  4954  		return nil, errNoPending
  4955  	}
  4956  	return buf, nil
  4957  }
  4958  
  4959  // Returns the current blkSize including deleted msgs etc.
  4960  func (mb *msgBlock) blkSize() uint64 {
  4961  	mb.mu.RLock()
  4962  	nb := mb.rbytes
  4963  	mb.mu.RUnlock()
  4964  	return nb
  4965  }
  4966  
// Update accounting on a write msg.
// Maintains first/last sequence and timestamps plus raw (rbytes) and live
// (bytes/msgs) counters for the block.
// Lock should be held.
func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) {
	// ebit marks a record written as already erased/deleted.
	isDeleted := seq&ebit != 0
	if isDeleted {
		seq = seq &^ ebit
	}

	fseq := atomic.LoadUint64(&mb.first.seq)
	// Initialize first seq/ts on the first real write into this block.
	if (fseq == 0 || mb.first.ts == 0) && seq >= fseq {
		atomic.StoreUint64(&mb.first.seq, seq)
		mb.first.ts = ts
	}
	// Need atomics here for selectMsgBlock speed.
	atomic.StoreUint64(&mb.last.seq, seq)
	mb.last.ts = ts
	// rbytes counts all records, including deleted ones; bytes/msgs only live.
	mb.rbytes += rl
	if !isDeleted {
		mb.bytes += rl
		mb.msgs++
	}
}
  4989  
  4990  // Lock should be held.
  4991  func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg []byte) (uint64, error) {
  4992  	var err error
  4993  
  4994  	// Get size for this message.
  4995  	rl := fileStoreMsgSize(subj, hdr, msg)
  4996  	if rl&hbit != 0 {
  4997  		return 0, ErrMsgTooLarge
  4998  	}
  4999  	// Grab our current last message block.
  5000  	mb := fs.lmb
  5001  
  5002  	// Mark as dirty for stream state.
  5003  	fs.dirty++
  5004  
  5005  	if mb == nil || mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize {
  5006  		if mb != nil && fs.fcfg.Compression != NoCompression {
  5007  			// We've now reached the end of this message block, if we want
  5008  			// to compress blocks then now's the time to do it.
  5009  			go mb.recompressOnDiskIfNeeded()
  5010  		}
  5011  		if mb, err = fs.newMsgBlockForWrite(); err != nil {
  5012  			return 0, err
  5013  		}
  5014  	}
  5015  
  5016  	// Ask msg block to store in write through cache.
  5017  	err = mb.writeMsgRecord(rl, seq, subj, hdr, msg, ts, fs.fip)
  5018  
  5019  	return rl, err
  5020  }
  5021  
  5022  func (mb *msgBlock) recompressOnDiskIfNeeded() error {
  5023  	alg := mb.fs.fcfg.Compression
  5024  	mb.mu.Lock()
  5025  	defer mb.mu.Unlock()
  5026  
  5027  	origFN := mb.mfn                    // The original message block on disk.
  5028  	tmpFN := mb.mfn + compressTmpSuffix // The compressed block will be written here.
  5029  
  5030  	// Open up the file block and read in the entire contents into memory.
  5031  	// One of two things will happen:
  5032  	// 1. The block will be compressed already and have a valid metadata
  5033  	//    header, in which case we do nothing.
  5034  	// 2. The block will be uncompressed, in which case we will compress it
  5035  	//    and then write it back out to disk, reencrypting if necessary.
  5036  	<-dios
  5037  	origBuf, err := os.ReadFile(origFN)
  5038  	dios <- struct{}{}
  5039  
  5040  	if err != nil {
  5041  		return fmt.Errorf("failed to read original block from disk: %w", err)
  5042  	}
  5043  
  5044  	// If the block is encrypted then we will need to decrypt it before
  5045  	// doing anything. We always encrypt after compressing because then the
  5046  	// compression can be as efficient as possible on the raw data, whereas
  5047  	// the encrypted ciphertext will not compress anywhere near as well.
  5048  	// The block encryption also covers the optional compression metadata.
  5049  	if mb.bek != nil && len(origBuf) > 0 {
  5050  		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  5051  		if err != nil {
  5052  			return err
  5053  		}
  5054  		mb.bek = bek
  5055  		mb.bek.XORKeyStream(origBuf, origBuf)
  5056  	}
  5057  
  5058  	meta := &CompressionInfo{}
  5059  	if _, err := meta.UnmarshalMetadata(origBuf); err != nil {
  5060  		// An error is only returned here if there's a problem with parsing
  5061  		// the metadata. If the file has no metadata at all, no error is
  5062  		// returned and the algorithm defaults to no compression.
  5063  		return fmt.Errorf("failed to read existing metadata header: %w", err)
  5064  	}
  5065  	if meta.Algorithm == alg {
  5066  		// The block is already compressed with the chosen algorithm so there
  5067  		// is nothing else to do. This is not a common case, it is here only
  5068  		// to ensure we don't do unnecessary work in case something asked us
  5069  		// to recompress an already compressed block with the same algorithm.
  5070  		return nil
  5071  	} else if alg != NoCompression {
  5072  		// The block is already compressed using some algorithm, so we need
  5073  		// to decompress the block using the existing algorithm before we can
  5074  		// recompress it with the new one.
  5075  		if origBuf, err = meta.Algorithm.Decompress(origBuf); err != nil {
  5076  			return fmt.Errorf("failed to decompress original block: %w", err)
  5077  		}
  5078  	}
  5079  
  5080  	// Rather than modifying the existing block on disk (which is a dangerous
  5081  	// operation if something goes wrong), create a new temporary file. We will
  5082  	// write out the new block here and then swap the files around afterwards
  5083  	// once everything else has succeeded correctly.
  5084  	<-dios
  5085  	tmpFD, err := os.OpenFile(tmpFN, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, defaultFilePerms)
  5086  	dios <- struct{}{}
  5087  	if err != nil {
  5088  		return fmt.Errorf("failed to create temporary file: %w", err)
  5089  	}
  5090  
  5091  	// The original buffer at this point is uncompressed, so we will now compress
  5092  	// it if needed. Note that if the selected algorithm is NoCompression, the
  5093  	// Compress function will just return the input buffer unmodified.
  5094  	cmpBuf, err := alg.Compress(origBuf)
  5095  	if err != nil {
  5096  		return fmt.Errorf("failed to compress block: %w", err)
  5097  	}
  5098  
  5099  	// We only need to write out the metadata header if compression is enabled.
  5100  	// If we're trying to uncompress the file on disk at this point, don't bother
  5101  	// writing metadata.
  5102  	if alg != NoCompression {
  5103  		meta := &CompressionInfo{
  5104  			Algorithm:    alg,
  5105  			OriginalSize: uint64(len(origBuf)),
  5106  		}
  5107  		cmpBuf = append(meta.MarshalMetadata(), cmpBuf...)
  5108  	}
  5109  
  5110  	// Re-encrypt the block if necessary.
  5111  	if mb.bek != nil && len(cmpBuf) > 0 {
  5112  		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  5113  		if err != nil {
  5114  			return err
  5115  		}
  5116  		mb.bek = bek
  5117  		mb.bek.XORKeyStream(cmpBuf, cmpBuf)
  5118  	}
  5119  
  5120  	// Write the new block data (which might be compressed or encrypted) to the
  5121  	// temporary file.
  5122  	errorCleanup := func(err error) error {
  5123  		tmpFD.Close()
  5124  		os.Remove(tmpFN)
  5125  		return err
  5126  	}
  5127  	if n, err := tmpFD.Write(cmpBuf); err != nil {
  5128  		return errorCleanup(fmt.Errorf("failed to write to temporary file: %w", err))
  5129  	} else if n != len(cmpBuf) {
  5130  		return errorCleanup(fmt.Errorf("short write to temporary file (%d != %d)", n, len(cmpBuf)))
  5131  	}
  5132  	if err := tmpFD.Sync(); err != nil {
  5133  		return errorCleanup(fmt.Errorf("failed to sync temporary file: %w", err))
  5134  	}
  5135  	if err := tmpFD.Close(); err != nil {
  5136  		return errorCleanup(fmt.Errorf("failed to close temporary file: %w", err))
  5137  	}
  5138  
  5139  	// Now replace the original file with the newly updated temp file.
  5140  	if err := os.Rename(tmpFN, origFN); err != nil {
  5141  		return fmt.Errorf("failed to move temporary file into place: %w", err)
  5142  	}
  5143  
  5144  	// Since the message block might be retained in memory, make sure the
  5145  	// compression algorithm is up-to-date, since this will be needed when
  5146  	// compacting or truncating.
  5147  	mb.cmp = alg
  5148  	return nil
  5149  }
  5150  
  5151  func (mb *msgBlock) decompressIfNeeded(buf []byte) ([]byte, error) {
  5152  	var meta CompressionInfo
  5153  	if n, err := meta.UnmarshalMetadata(buf); err != nil {
  5154  		// There was a problem parsing the metadata header of the block.
  5155  		// If there's no metadata header, an error isn't returned here,
  5156  		// we will instead just use default values of no compression.
  5157  		return nil, err
  5158  	} else if n == 0 {
  5159  		// There were no metadata bytes, so we assume the block is not
  5160  		// compressed and return it as-is.
  5161  		return buf, nil
  5162  	} else {
  5163  		// Metadata was present so it's quite likely the block contents
  5164  		// are compressed. If by any chance the metadata claims that the
  5165  		// block is uncompressed, then the input slice is just returned
  5166  		// unmodified.
  5167  		return meta.Algorithm.Decompress(buf[n:])
  5168  	}
  5169  }
  5170  
  5171  // Lock should be held.
  5172  func (mb *msgBlock) ensureRawBytesLoaded() error {
  5173  	if mb.rbytes > 0 {
  5174  		return nil
  5175  	}
  5176  	f, err := mb.openBlock()
  5177  	if err != nil {
  5178  		return err
  5179  	}
  5180  	defer f.Close()
  5181  	if fi, err := f.Stat(); fi != nil && err == nil {
  5182  		mb.rbytes = uint64(fi.Size())
  5183  	} else {
  5184  		return err
  5185  	}
  5186  	return nil
  5187  }
  5188  
// Sync msg and index files as needed. This is called from a timer.
// For each block: closes idle FDs, drops idle fss metadata, compacts blocks
// with reclaimable space, and fsyncs blocks flagged needSync. Finally re-arms
// the sync timer and syncs the stream state file.
func (fs *fileStore) syncBlocks() {
	fs.mu.RLock()
	if fs.closed {
		fs.mu.RUnlock()
		return
	}
	// Snapshot the block list so we don't hold the fs lock while syncing.
	blks := append([]*msgBlock(nil), fs.blks...)
	lmb := fs.lmb
	syncInterval := fs.fcfg.SyncInterval
	fs.mu.RUnlock()

	var markDirty bool
	for _, mb := range blks {
		// Do actual sync. Hold lock for consistency.
		mb.mu.Lock()
		if mb.closed {
			mb.mu.Unlock()
			continue
		}
		// See if we can close FDs due to being idle.
		if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle {
			mb.dirtyCloseWithRemove(false)
		}
		// Check our fss subject metadata.
		// If we have no activity within sync interval remove.
		if mb.fssLoaded() && mb.sinceLastActivity() > syncInterval {
			mb.fss = nil
		}

		// Check if we should compact here as well.
		// Do not compact last mb.
		var needsCompact bool
		if mb != lmb && mb.ensureRawBytesLoaded() == nil && mb.rbytes > mb.bytes {
			needsCompact = true
			markDirty = true
		}

		// Check if we need to sync. We will not hold lock during actual sync.
		needSync := mb.needSync
		if needSync {
			// Flush anything that may be pending.
			mb.flushPendingMsgsLocked()
		}
		mb.mu.Unlock()

		// Check if we should compact here.
		// Need to hold fs lock in case we reference psim when loading in the mb.
		if needsCompact {
			fs.mu.RLock()
			mb.mu.Lock()
			mb.compact()
			mb.mu.Unlock()
			fs.mu.RUnlock()
		}

		// Check if we need to sync this block.
		if needSync {
			mb.mu.Lock()
			var fd *os.File
			var didOpen bool
			if mb.mfd != nil {
				fd = mb.mfd
			} else {
				// Block FD was closed; reopen just for the fsync.
				<-dios
				fd, _ = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
				dios <- struct{}{}
				didOpen = true
			}
			// If we have an fd.
			if fd != nil {
				canClear := fd.Sync() == nil
				// If we opened the file close the fd.
				if didOpen {
					fd.Close()
				}
				// Only clear sync flag on success.
				if canClear {
					mb.needSync = false
				}
			}
			mb.mu.Unlock()
		}
	}

	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return
	}
	fs.setSyncTimer()
	if markDirty {
		fs.dirty++
	}

	// Sync state file if we are not running with sync always.
	if !fs.fcfg.SyncAlways {
		fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
		<-dios
		fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms)
		dios <- struct{}{}
		if fd != nil {
			fd.Sync()
			fd.Close()
		}
	}
	fs.mu.Unlock()
}
  5297  
  5298  // Select the message block where this message should be found.
  5299  // Return nil if not in the set.
  5300  // Read lock should be held.
  5301  func (fs *fileStore) selectMsgBlock(seq uint64) *msgBlock {
  5302  	_, mb := fs.selectMsgBlockWithIndex(seq)
  5303  	return mb
  5304  }
  5305  
// Returns the index and message block that should contain seq, or (-1, nil)
// if seq is out of range. Uses a linear scan for small block counts and a
// binary search otherwise.
// Lock should be held.
func (fs *fileStore) selectMsgBlockWithIndex(seq uint64) (int, *msgBlock) {
	// Check for out of range.
	if seq < fs.state.FirstSeq || seq > fs.state.LastSeq || fs.state.Msgs == 0 {
		return -1, nil
	}

	const linearThresh = 32
	nb := len(fs.blks) - 1

	if nb < linearThresh {
		// Blocks are ordered by sequence, so the first block whose last seq
		// covers us is the one that should contain seq.
		for i, mb := range fs.blks {
			if seq <= atomic.LoadUint64(&mb.last.seq) {
				return i, mb
			}
		}
		return -1, nil
	}

	// Do traditional binary search here since we know the blocks are sorted by sequence first and last.
	for low, high, mid := 0, nb, nb/2; low <= high; mid = (low + high) / 2 {
		mb := fs.blks[mid]
		// Right now these atomic loads do not factor in, so fine to leave. Was considering
		// uplifting these to fs scope to avoid atomic load but not needed.
		first, last := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
		if seq > last {
			low = mid + 1
		} else if seq < first {
			// A message block's first sequence can change here meaning we could find a gap.
			// We want to behave like above, which if inclusive (we check at start) should
			// always return an index and a valid mb.
			// If we have a gap then our seq would be > fs.blks[mid-1].last.seq
			if mid == 0 || seq > atomic.LoadUint64(&fs.blks[mid-1].last.seq) {
				return mid, mb
			}
			high = mid - 1
		} else {
			return mid, mb
		}
	}

	return -1, nil
}
  5349  
  5350  // Select the message block where this message should be found.
  5351  // Return nil if not in the set.
  5352  func (fs *fileStore) selectMsgBlockForStart(minTime time.Time) *msgBlock {
  5353  	fs.mu.RLock()
  5354  	defer fs.mu.RUnlock()
  5355  
  5356  	t := minTime.UnixNano()
  5357  	for _, mb := range fs.blks {
  5358  		mb.mu.RLock()
  5359  		found := t <= mb.last.ts
  5360  		mb.mu.RUnlock()
  5361  		if found {
  5362  			return mb
  5363  		}
  5364  	}
  5365  	return nil
  5366  }
  5367  
// Index a raw msg buffer.
// Walks the raw block bytes, building the cache index (idx), first sequence,
// delete map entries for holes/erased records, and optionally the per-subject
// fss state. Appends to any existing cache contents.
// Lock should be held.
func (mb *msgBlock) indexCacheBuf(buf []byte) error {
	var le = binary.LittleEndian

	var fseq uint64
	var idx []uint32
	var index uint32

	mbFirstSeq := atomic.LoadUint64(&mb.first.seq)
	mbLastSeq := atomic.LoadUint64(&mb.last.seq)

	// Sanity check here since we calculate size to allocate based on this.
	if mbFirstSeq > (mbLastSeq + 1) { // Purged state first == last + 1
		mb.fs.warn("indexCacheBuf corrupt state: mb.first %d mb.last %d", mbFirstSeq, mbLastSeq)
		// This would cause idxSz to wrap.
		return errCorruptState
	}

	// Capture beginning size of dmap.
	dms := uint64(mb.dmap.Size())
	idxSz := mbLastSeq - mbFirstSeq + 1

	if mb.cache == nil {
		// Approximation, may adjust below.
		fseq = mbFirstSeq
		idx = make([]uint32, 0, idxSz)
		mb.cache = &cache{}
	} else {
		// Appending to an existing cache; continue from its index/buffer.
		fseq = mb.cache.fseq
		idx = mb.cache.idx
		if len(idx) == 0 {
			idx = make([]uint32, 0, idxSz)
		}
		index = uint32(len(mb.cache.buf))
		buf = append(mb.cache.buf, buf...)
	}

	// Create FSS if we should track.
	var popFss bool
	if mb.fssNotLoaded() {
		mb.fss = make(map[string]*SimpleState)
		popFss = true
	}

	lbuf := uint32(len(buf))
	var seq uint64
	for index < lbuf {
		if index+msgHdrSize > lbuf {
			return errCorruptState
		}
		hdr := buf[index : index+msgHdrSize]
		rl, slen := le.Uint32(hdr[0:]), int(le.Uint16(hdr[20:]))
		seq = le.Uint64(hdr[4:])

		// Clear any headers bit that could be set.
		rl &^= hbit
		dlen := int(rl) - msgHdrSize

		// Do some quick sanity checks here.
		if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
			mb.fs.warn("indexCacheBuf corrupt record state: dlen %d slen %d index %d rl %d lbuf %d", dlen, slen, index, rl, lbuf)
			// This means something is off.
			// TODO(dlc) - Add into bad list?
			return errCorruptState
		}

		// Check for tombstones which we can skip in terms of indexing.
		if seq&tbit != 0 {
			index += rl
			continue
		}

		// Clear any erase bits.
		erased := seq&ebit != 0
		seq = seq &^ ebit

		// We defer checksum checks to individual msg cache lookups to amortorize costs and
		// not introduce latency for first message from a newly loaded block.
		if seq >= mbFirstSeq {
			// Track that we do not have holes.
			if slot := int(seq - mbFirstSeq); slot != len(idx) {
				// If we have a hole fill it.
				for dseq := mbFirstSeq + uint64(len(idx)); dseq < seq; dseq++ {
					idx = append(idx, dbit)
					if dms == 0 {
						mb.dmap.Insert(dseq)
					}
				}
			}
			// Add to our index.
			idx = append(idx, index)
			mb.cache.lrl = uint32(rl)
			// Adjust if we guessed wrong.
			if seq != 0 && seq < fseq {
				fseq = seq
			}

			// Make sure our dmap has this entry if it was erased.
			if erased && dms == 0 {
				mb.dmap.Insert(seq)
			}

			// Handle FSS inline here.
			if popFss && slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) {
				bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)]
				if ss := mb.fss[string(bsubj)]; ss != nil {
					ss.Msgs++
					ss.Last = seq
				} else {
					mb.fss[string(bsubj)] = &SimpleState{
						Msgs:  1,
						First: seq,
						Last:  seq,
					}
				}
			}
		}
		index += rl
	}

	// Track holes at the end of the block, these would be missed in the
	// earlier loop if we've ran out of block file to look at, but should
	// be easily noticed because the seq will be below the last seq from
	// the index.
	if seq > 0 && seq < mbLastSeq {
		for dseq := seq; dseq < mbLastSeq; dseq++ {
			idx = append(idx, dbit)
			if dms == 0 {
				mb.dmap.Insert(dseq)
			}
		}
	}

	mb.cache.buf = buf
	mb.cache.idx = idx
	mb.cache.fseq = fseq
	mb.cache.wp += int(lbuf)

	return nil
}
  5509  
  5510  // flushPendingMsgs writes out any messages for this message block.
  5511  func (mb *msgBlock) flushPendingMsgs() error {
  5512  	mb.mu.Lock()
  5513  	fsLostData, err := mb.flushPendingMsgsLocked()
  5514  	fs := mb.fs
  5515  	mb.mu.Unlock()
  5516  
  5517  	// Signals us that we need to rebuild filestore state.
  5518  	if fsLostData != nil && fs != nil {
  5519  		// Rebuild fs state too.
  5520  		fs.rebuildState(fsLostData)
  5521  	}
  5522  	return err
  5523  }
  5524  
  5525  // Write function for actual data.
  5526  // mb.mfd should not be nil.
  5527  // Lock should held.
  5528  func (mb *msgBlock) writeAt(buf []byte, woff int64) (int, error) {
  5529  	// Used to mock write failures.
  5530  	if mb.mockWriteErr {
  5531  		// Reset on trip.
  5532  		mb.mockWriteErr = false
  5533  		return 0, errors.New("mock write error")
  5534  	}
  5535  	<-dios
  5536  	n, err := mb.mfd.WriteAt(buf, woff)
  5537  	dios <- struct{}{}
  5538  	return n, err
  5539  }
  5540  
// flushPendingMsgsLocked writes out any messages for this message block.
// Encrypts pending bytes if needed, appends them to the block file (handling
// partial writes), then either advances the write pointer or recycles the
// cache buffer depending on recent load interest. Returns lost-data info if
// a write failure forced a state rebuild.
// Lock should be held.
func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) {
	// Signals us that we need to rebuild filestore state.
	var fsLostData *LostStreamData

	if mb.cache == nil || mb.mfd == nil {
		return nil, nil
	}

	buf, err := mb.bytesPending()
	// If we got an error back return here.
	if err != nil {
		// No pending data to be written is not an error.
		if err == errNoPending || err == errNoCache {
			err = nil
		}
		return nil, err
	}

	woff := int64(mb.cache.off + mb.cache.wp)
	lob := len(buf)

	// TODO(dlc) - Normally we would not hold the lock across I/O so we can improve performance.
	// We will hold to stabilize the code base, as we have had a few anomalies with partial cache errors
	// under heavy load.

	// Check if we need to encrypt.
	if mb.bek != nil && lob > 0 {
		// Need to leave original alone.
		var dst []byte
		if lob <= defaultLargeBlockSize {
			dst = getMsgBlockBuf(lob)[:lob]
		} else {
			dst = make([]byte, lob)
		}
		mb.bek.XORKeyStream(dst, buf)
		buf = dst
	}

	// Append new data to the message block file.
	// Loop to handle partial writes; on hard error, rebuild state from disk.
	for lbb := lob; lbb > 0; lbb = len(buf) {
		n, err := mb.writeAt(buf, woff)
		if err != nil {
			mb.dirtyCloseWithRemove(false)
			ld, _, _ := mb.rebuildStateLocked()
			mb.werr = err
			return ld, err
		}
		// Update our write offset.
		woff += int64(n)
		// Partial write.
		if n != lbb {
			buf = buf[n:]
		} else {
			// Done.
			break
		}
	}

	// Clear any error.
	mb.werr = nil

	// Cache may be gone.
	if mb.cache == nil || mb.mfd == nil {
		return fsLostData, mb.werr
	}

	// Check if we are in sync always mode.
	if mb.syncAlways {
		mb.mfd.Sync()
	} else {
		mb.needSync = true
	}

	// Check for additional writes while we were writing to the disk.
	moreBytes := len(mb.cache.buf) - mb.cache.wp - lob

	// Decide what we want to do with the buffer in hand. If we have load interest
	// we will hold onto the whole thing, otherwise empty the buffer, possibly reusing it.
	if ts := time.Now().UnixNano(); ts < mb.llts || (ts-mb.llts) <= int64(mb.cexp) {
		mb.cache.wp += lob
	} else {
		if cap(mb.cache.buf) <= maxBufReuse {
			buf = mb.cache.buf[:0]
		} else {
			recycleMsgBlockBuf(mb.cache.buf)
			buf = nil
		}
		if moreBytes > 0 {
			// Preserve bytes written into the cache while we were flushing.
			nbuf := mb.cache.buf[len(mb.cache.buf)-moreBytes:]
			if moreBytes > (len(mb.cache.buf)/4*3) && cap(nbuf) <= maxBufReuse {
				buf = nbuf
			} else {
				buf = append(buf, nbuf...)
			}
		}
		// Update our cache offset.
		mb.cache.off = int(woff)
		// Reset write pointer.
		mb.cache.wp = 0
		// Place buffer back in the cache structure.
		mb.cache.buf = buf
		// Mark fseq to 0
		mb.cache.fseq = 0
	}

	return fsLostData, mb.werr
}
  5650  
// clearLoading resets the in-progress flag set by loadMsgsWithLock so a
// subsequent load attempt can proceed.
// Lock should be held.
func (mb *msgBlock) clearLoading() {
	mb.loading = false
}
  5655  
// Will load msgs from disk.
// Locking wrapper around loadMsgsWithLock.
func (mb *msgBlock) loadMsgs() error {
	// We hold the lock here the whole time by design.
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.loadMsgsWithLock()
}
  5663  
  5664  // Lock should be held.
  5665  func (mb *msgBlock) cacheAlreadyLoaded() bool {
  5666  	if mb.cache == nil || mb.cache.off != 0 || mb.cache.fseq == 0 || len(mb.cache.buf) == 0 {
  5667  		return false
  5668  	}
  5669  	numEntries := mb.msgs + uint64(mb.dmap.Size()) + (atomic.LoadUint64(&mb.first.seq) - mb.cache.fseq)
  5670  	return numEntries == uint64(len(mb.cache.idx))
  5671  }
  5672  
// cacheNotLoaded reports whether the cache must be (re)loaded before lookups.
// Lock should be held.
func (mb *msgBlock) cacheNotLoaded() bool {
	return !mb.cacheAlreadyLoaded()
}
  5677  
// Report if our fss is not loaded.
// A noTrack block never carries fss, so it is not considered missing.
// Lock should be held.
func (mb *msgBlock) fssNotLoaded() bool {
	return mb.fss == nil && !mb.noTrack
}
  5683  
// Report if we have our fss loaded.
// Lock should be held.
func (mb *msgBlock) fssLoaded() bool {
	return mb.fss != nil
}
  5689  
// Wrap openBlock for the gated semaphore processing.
// The dios channel bounds the number of concurrent disk IO operations.
// Lock should be held
func (mb *msgBlock) openBlock() (*os.File, error) {
	// Gate with concurrent IO semaphore.
	<-dios
	f, err := os.Open(mb.mfn)
	dios <- struct{}{}
	return f, err
}
  5699  
// Used to load in the block contents.
// Reads the whole block file into buf (reusing or allocating as needed),
// records the raw byte count in mb.rbytes on success, and returns the data.
// Lock should be held and all conditionals satisfied prior.
func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) {
	var f *os.File
	// Re-use if we have mfd open.
	if mb.mfd != nil {
		f = mb.mfd
		// We read from the start, so rewind first. If the seek fails the fd
		// is in a bad state; close it and fall back to a fresh open below.
		if n, err := f.Seek(0, 0); n != 0 || err != nil {
			f = nil
			mb.closeFDsLockedNoCheck()
		}
	}
	if f == nil {
		var err error
		f, err = mb.openBlock()
		if err != nil {
			// Map a missing block file to our sentinel so callers can
			// trigger a state rebuild.
			if os.IsNotExist(err) {
				err = errNoBlkData
			}
			return nil, err
		}
		defer f.Close()
	}

	// Determine the block file size so we can size the read buffer.
	var sz int
	if info, err := f.Stat(); err == nil {
		sz64 := info.Size()
		// Guard against overflow converting to int on 32-bit platforms.
		if int64(int(sz64)) == sz64 {
			sz = int(sz64)
		} else {
			return nil, errMsgBlkTooBig
		}
	}

	if buf == nil {
		buf = getMsgBlockBuf(sz)
		if sz > cap(buf) {
			// We know we will make a new one so just recycle for now.
			recycleMsgBlockBuf(buf)
			buf = nil
		}
	}

	if sz > cap(buf) {
		buf = make([]byte, sz)
	} else {
		buf = buf[:sz]
	}

	// Gate the read with the concurrent IO semaphore.
	<-dios
	n, err := io.ReadFull(f, buf)
	dios <- struct{}{}
	// On success capture raw bytes size.
	if err == nil {
		mb.rbytes = uint64(n)
	}
	return buf[:n], err
}
  5758  
  5759  // Lock should be held.
  5760  func (mb *msgBlock) loadMsgsWithLock() error {
  5761  	// Check for encryption, we do not load keys on startup anymore so might need to load them here.
  5762  	if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) {
  5763  		if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
  5764  			return err
  5765  		}
  5766  	}
  5767  
  5768  	// Check to see if we are loading already.
  5769  	if mb.loading {
  5770  		return nil
  5771  	}
  5772  
  5773  	// Set loading status.
  5774  	mb.loading = true
  5775  	defer mb.clearLoading()
  5776  
  5777  	var nchecks int
  5778  
  5779  checkCache:
  5780  	nchecks++
  5781  	if nchecks > 8 {
  5782  		return errCorruptState
  5783  	}
  5784  
  5785  	// Check to see if we have a full cache.
  5786  	if mb.cacheAlreadyLoaded() {
  5787  		return nil
  5788  	}
  5789  
  5790  	mb.llts = time.Now().UnixNano()
  5791  
  5792  	// FIXME(dlc) - We could be smarter here.
  5793  	if buf, _ := mb.bytesPending(); len(buf) > 0 {
  5794  		ld, err := mb.flushPendingMsgsLocked()
  5795  		if ld != nil && mb.fs != nil {
  5796  			// We do not know if fs is locked or not at this point.
  5797  			// This should be an exceptional condition so do so in Go routine.
  5798  			go mb.fs.rebuildState(ld)
  5799  		}
  5800  		if err != nil {
  5801  			return err
  5802  		}
  5803  		goto checkCache
  5804  	}
  5805  
  5806  	// Load in the whole block.
  5807  	// We want to hold the mb lock here to avoid any changes to state.
  5808  	buf, err := mb.loadBlock(nil)
  5809  	if err != nil {
  5810  		mb.fs.warn("loadBlock error: ", err)
  5811  		if err == errNoBlkData {
  5812  			if ld, _, err := mb.rebuildStateLocked(); err != nil && ld != nil {
  5813  				// Rebuild fs state too.
  5814  				go mb.fs.rebuildState(ld)
  5815  			}
  5816  		}
  5817  		return err
  5818  	}
  5819  
  5820  	// Reset the cache since we just read everything in.
  5821  	// Make sure this is cleared in case we had a partial when we started.
  5822  	mb.clearCacheAndOffset()
  5823  
  5824  	// Check if we need to decrypt.
  5825  	if mb.bek != nil && len(buf) > 0 {
  5826  		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  5827  		if err != nil {
  5828  			return err
  5829  		}
  5830  		mb.bek = bek
  5831  		mb.bek.XORKeyStream(buf, buf)
  5832  	}
  5833  
  5834  	// Check for compression.
  5835  	if buf, err = mb.decompressIfNeeded(buf); err != nil {
  5836  		return err
  5837  	}
  5838  
  5839  	if err := mb.indexCacheBuf(buf); err != nil {
  5840  		if err == errCorruptState {
  5841  			var ld *LostStreamData
  5842  			if ld, _, err = mb.rebuildStateLocked(); ld != nil {
  5843  				// We do not know if fs is locked or not at this point.
  5844  				// This should be an exceptional condition so do so in Go routine.
  5845  				go mb.fs.rebuildState(ld)
  5846  			}
  5847  		}
  5848  		if err != nil {
  5849  			return err
  5850  		}
  5851  		goto checkCache
  5852  	}
  5853  
  5854  	if len(buf) > 0 {
  5855  		mb.cloads++
  5856  		mb.startCacheExpireTimer()
  5857  	}
  5858  
  5859  	return nil
  5860  }
  5861  
// Fetch a message from this block, possibly reading in and caching the messages.
// We assume the block was selected and is correct, so we do not do range checks.
// The second return value reports whether a detected linear scan just touched
// the block edge, letting the caller proactively expire the cache.
func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
	if seq < fseq || seq > lseq {
		return nil, false, ErrStoreMsgNotFound
	}

	// See if we can short circuit if we already know msg deleted.
	if mb.dmap.Exists(seq) {
		// Update for scanning like cacheLookup would have.
		llseq := mb.llseq
		if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 || seq == mb.llseq-1 {
			mb.llseq = seq
		}
		expireOk := (seq == lseq && llseq == seq-1) || (seq == fseq && llseq == seq+1)
		return nil, expireOk, errDeletedMsg
	}

	// Load the block from disk if the cache is not complete.
	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return nil, false, err
		}
	}
	// Snapshot before cacheLookup updates it.
	llseq := mb.llseq

	fsm, err := mb.cacheLookup(seq, sm)
	if err != nil {
		return nil, false, err
	}
	expireOk := (seq == lseq && llseq == seq-1) || (seq == fseq && llseq == seq+1)
	return fsm, expireOk, err
}
  5898  
// Sentinel errors for the filestore layer. Callers compare with == (e.g.
// errDeletedMsg, errPartialCache above), so these must remain distinct values.
var (
	errNoCache       = errors.New("no message cache")
	errBadMsg        = errors.New("malformed or corrupt message")
	errDeletedMsg    = errors.New("deleted message")
	errPartialCache  = errors.New("partial cache")
	errNoPending     = errors.New("message block does not have pending data")
	errNotReadable   = errors.New("storage directory not readable")
	errCorruptState  = errors.New("corrupt state file")
	errPriorState    = errors.New("prior state file")
	errPendingData   = errors.New("pending data still present")
	errNoEncryption  = errors.New("encryption not enabled")
	errBadKeySize    = errors.New("encryption bad key size")
	errNoMsgBlk      = errors.New("no message block")
	errMsgBlkTooBig  = errors.New("message block size exceeded int capacity")
	errUnknownCipher = errors.New("unknown cipher")
	errNoMainKey     = errors.New("encrypted store encountered with no main key")
	errNoBlkData     = errors.New("message block data missing")
	errStateTooBig   = errors.New("store state too big for optional write")
)
  5918  
const (
	// hbit serves two roles: in a record's length word it signals the record
	// carries message headers; in a cache idx entry it marks that the
	// record's checksum has already been verified.
	hbit = 1 << 31
	// Used for marking erased messages sequences.
	ebit = 1 << 63
	// Used for marking tombstone sequences.
	tbit = 1 << 62
	// Used to mark an index as deleted and non-existent.
	dbit = 1 << 30
)
  5930  
// Will do a lookup from cache.
// Returns the parsed message, or errDeletedMsg / errNoCache / errPartialCache
// when the sequence is deleted or the cache cannot serve it.
// Lock should be held.
func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	if seq < atomic.LoadUint64(&mb.first.seq) || seq > atomic.LoadUint64(&mb.last.seq) {
		return nil, ErrStoreMsgNotFound
	}

	// The llseq signals us when we can expire a cache at the end of a linear scan.
	// We want to only update when we know the last reads (multiple consumers) are sequential.
	// We want to account for forwards and backwards linear scans.
	if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 || seq == mb.llseq-1 {
		mb.llseq = seq
	}

	// If we have a delete map check it.
	if mb.dmap.Exists(seq) {
		mb.llts = time.Now().UnixNano()
		return nil, errDeletedMsg
	}

	// Detect no cache loaded.
	if mb.cache == nil || mb.cache.fseq == 0 || len(mb.cache.idx) == 0 || len(mb.cache.buf) == 0 {
		var reason string
		if mb.cache == nil {
			reason = "no cache"
		} else if mb.cache.fseq == 0 {
			reason = "fseq is 0"
		} else if len(mb.cache.idx) == 0 {
			reason = "no idx present"
		} else {
			reason = "cache buf empty"
		}
		mb.fs.warn("Cache lookup detected no cache: %s", reason)
		return nil, errNoCache
	}
	// Check partial cache status.
	if seq < mb.cache.fseq {
		mb.fs.warn("Cache lookup detected partial cache: seq %d vs cache fseq %d", seq, mb.cache.fseq)
		return nil, errPartialCache
	}

	// Slot info gives the buffer offset and whether the hash was verified.
	bi, _, hashChecked, err := mb.slotInfo(int(seq - mb.cache.fseq))
	if err != nil {
		return nil, err
	}

	// Update cache activity.
	mb.llts = time.Now().UnixNano()

	// Translate the absolute offset into an index within the cache buffer.
	li := int(bi) - mb.cache.off
	if li >= len(mb.cache.buf) {
		return nil, errPartialCache
	}
	buf := mb.cache.buf[li:]

	// We use the high bit to denote we have already checked the checksum.
	var hh hash.Hash64
	if !hashChecked {
		hh = mb.hh // This will force the hash check in msgFromBuf.
	}

	// Parse from the raw buffer.
	fsm, err := mb.msgFromBuf(buf, sm, hh)
	if err != nil || fsm == nil {
		return nil, err
	}

	// Deleted messages that are decoded return a 0 for sequence.
	if fsm.seq == 0 {
		return nil, errDeletedMsg
	}

	// A mismatch means the cache is inconsistent; drop the buffer.
	if seq != fsm.seq {
		recycleMsgBlockBuf(mb.cache.buf)
		mb.cache.buf = nil
		return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, fsm.seq)
	}

	// Mark this slot as checksum-verified (set hbit) so future lookups
	// can skip the hash check.
	if !hashChecked {
		mb.cache.idx[seq-mb.cache.fseq] = (bi | hbit)
	}

	return fsm, nil
}
  6016  
  6017  // Used when we are checking if discarding a message due to max msgs per subject will give us
  6018  // enough room for a max bytes condition.
  6019  // Lock should be already held.
  6020  func (fs *fileStore) sizeForSeq(seq uint64) int {
  6021  	if seq == 0 {
  6022  		return 0
  6023  	}
  6024  	var smv StoreMsg
  6025  	if mb := fs.selectMsgBlock(seq); mb != nil {
  6026  		if sm, _, _ := mb.fetchMsg(seq, &smv); sm != nil {
  6027  			return int(fileStoreMsgSize(sm.subj, sm.hdr, sm.msg))
  6028  		}
  6029  	}
  6030  	return 0
  6031  }
  6032  
  6033  // Will return message for the given sequence number.
  6034  func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
  6035  	// TODO(dlc) - Since Store, Remove, Skip all hold the write lock on fs this will
  6036  	// be stalled. Need another lock if want to happen in parallel.
  6037  	fs.mu.RLock()
  6038  	if fs.closed {
  6039  		fs.mu.RUnlock()
  6040  		return nil, ErrStoreClosed
  6041  	}
  6042  	// Indicates we want first msg.
  6043  	if seq == 0 {
  6044  		seq = fs.state.FirstSeq
  6045  	}
  6046  	// Make sure to snapshot here.
  6047  	mb, lseq := fs.selectMsgBlock(seq), fs.state.LastSeq
  6048  	fs.mu.RUnlock()
  6049  
  6050  	if mb == nil {
  6051  		var err = ErrStoreEOF
  6052  		if seq <= lseq {
  6053  			err = ErrStoreMsgNotFound
  6054  		}
  6055  		return nil, err
  6056  	}
  6057  
  6058  	fsm, expireOk, err := mb.fetchMsg(seq, sm)
  6059  	if err != nil {
  6060  		return nil, err
  6061  	}
  6062  
  6063  	// We detected a linear scan and access to the last message.
  6064  	// If we are not the last message block we can try to expire the cache.
  6065  	if expireOk {
  6066  		mb.tryForceExpireCache()
  6067  	}
  6068  
  6069  	return fsm, nil
  6070  }
  6071  
// Internal function to return msg parts from a raw buffer.
// Record layout (little endian): rl(4) seq(8) ts(8) slen(2) subj [hlen(4) hdr]
// msg hash(8), where rl's high bit (hbit) signals the presence of headers.
// Lock should be held.
func (mb *msgBlock) msgFromBuf(buf []byte, sm *StoreMsg, hh hash.Hash64) (*StoreMsg, error) {
	if len(buf) < emptyRecordLen {
		return nil, errBadMsg
	}
	var le = binary.LittleEndian

	hdr := buf[:msgHdrSize]
	rl := le.Uint32(hdr[0:])
	hasHeaders := rl&hbit != 0
	rl &^= hbit // clear header bit
	dlen := int(rl) - msgHdrSize
	slen := int(le.Uint16(hdr[20:]))
	// Simple sanity check.
	if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || int(rl) > len(buf) {
		return nil, errBadMsg
	}
	data := buf[msgHdrSize : msgHdrSize+dlen]
	// Do checksum tests here if requested. The hash covers seq+ts (hdr[4:20]),
	// the subject, and the data, skipping the 4-byte header length prefix and
	// the trailing hash itself.
	if hh != nil {
		hh.Reset()
		hh.Write(hdr[4:20])
		hh.Write(data[:slen])
		if hasHeaders {
			hh.Write(data[slen+4 : dlen-recordHashSize])
		} else {
			hh.Write(data[slen : dlen-recordHashSize])
		}
		if !bytes.Equal(hh.Sum(nil), data[len(data)-8:]) {
			return nil, errBadMsg
		}
	}
	// An ebit-marked sequence denotes an erased message; normalize to 0.
	seq := le.Uint64(hdr[4:])
	if seq&ebit != 0 {
		seq = 0
	}
	ts := int64(le.Uint64(hdr[12:]))

	// Create a StoreMsg if needed.
	if sm == nil {
		sm = new(StoreMsg)
	} else {
		sm.clear()
	}
	// To recycle the large blocks we can never pass back a reference, so need to copy for the upper
	// layers and for us to be safe to expire, and recycle, the large msgBlocks.
	end := dlen - 8

	if hasHeaders {
		hl := le.Uint32(data[slen:])
		bi := slen + 4
		li := bi + int(hl)
		sm.buf = append(sm.buf, data[bi:end]...)
		li, end = li-bi, end-bi
		// Full-slice expression caps hdr so appends cannot clobber msg.
		sm.hdr = sm.buf[0:li:li]
		sm.msg = sm.buf[li:end]
	} else {
		sm.buf = append(sm.buf, data[slen:end]...)
		sm.msg = sm.buf[0 : end-slen]
	}
	sm.seq, sm.ts = seq, ts
	if slen > 0 {
		// Make a copy since sm.subj lifetime may last longer.
		sm.subj = string(data[:slen])
	}

	return sm, nil
}
  6141  
// LoadMsg will lookup the message by sequence number and return it if found.
// A seq of 0 returns the first message in the stream.
func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
	return fs.msgForSeq(seq, sm)
}
  6146  
// loadLast will load the last message for a subject. Subject should be non empty and not ">".
// Walks message blocks newest to oldest, narrowing the range via psim for
// literal subjects, and returns the first (highest) match found.
func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.closed || fs.lmb == nil {
		return nil, ErrStoreClosed
	}

	if len(fs.blks) == 0 {
		return nil, ErrStoreMsgNotFound
	}

	// Default scan range covers every block, newest to oldest.
	start, stop := fs.lmb.index, fs.blks[0].index
	wc := subjectHasWildcard(subj)
	// If literal subject check for presence.
	if !wc {
		if info, ok := fs.psim.Find(stringToBytes(subj)); !ok {
			return nil, ErrStoreMsgNotFound
		} else {
			// Narrow to the blocks that can contain this subject.
			start, stop = info.lblk, info.fblk
		}
	}

	// Walk blocks backwards.
	for i := start; i >= stop; i-- {
		mb := fs.bim[i]
		if mb == nil {
			continue
		}
		mb.mu.Lock()
		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
			mb.mu.Unlock()
			return nil, err
		}
		var l uint64
		// Optimize if subject is not a wildcard.
		if !wc {
			if ss := mb.fss[subj]; ss != nil {
				l = ss.Last
			}
		}
		// Fall back to a filtered scan (needed for wildcards).
		if l == 0 {
			_, _, l = mb.filteredPendingLocked(subj, wc, atomic.LoadUint64(&mb.first.seq))
		}
		if l > 0 {
			if mb.cacheNotLoaded() {
				if err := mb.loadMsgsWithLock(); err != nil {
					mb.mu.Unlock()
					return nil, err
				}
			}
			lsm, err = mb.cacheLookup(l, sm)
		}
		mb.mu.Unlock()
		// The newest block with a match holds the last message.
		if l > 0 {
			break
		}
	}
	return lsm, err
}
  6208  
  6209  // LoadLastMsg will return the last message we have that matches a given subject.
  6210  // The subject can be a wildcard.
  6211  func (fs *fileStore) LoadLastMsg(subject string, smv *StoreMsg) (sm *StoreMsg, err error) {
  6212  	if subject == _EMPTY_ || subject == fwcs {
  6213  		sm, err = fs.msgForSeq(fs.lastSeq(), smv)
  6214  	} else {
  6215  		sm, err = fs.loadLast(subject, smv)
  6216  	}
  6217  	if sm == nil || (err != nil && err != ErrStoreClosed) {
  6218  		err = ErrStoreMsgNotFound
  6219  	}
  6220  	return sm, err
  6221  }
  6222  
// LoadNextMsgMulti will find the next message matching any entry in the sublist.
// Returns the message and its sequence, or (nil, LastSeq, ErrStoreEOF) when
// nothing past start matches.
func (fs *fileStore) LoadNextMsgMulti(sl *Sublist, start uint64, smp *StoreMsg) (sm *StoreMsg, skip uint64, err error) {
	// A nil sublist degenerates to a match-everything scan.
	if sl == nil {
		return fs.LoadNextMsg(_EMPTY_, false, start, smp)
	}
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.closed {
		return nil, 0, ErrStoreClosed
	}
	if fs.state.Msgs == 0 {
		return nil, fs.state.LastSeq, ErrStoreEOF
	}
	// Clamp start into the stream's range.
	if start < fs.state.FirstSeq {
		start = fs.state.FirstSeq
	}

	// Walk blocks forward from the one containing start.
	if bi, _ := fs.selectMsgBlockWithIndex(start); bi >= 0 {
		for i := bi; i < len(fs.blks); i++ {
			mb := fs.blks[i]
			if sm, expireOk, err := mb.firstMatchingMulti(sl, start, smp); err == nil {
				if expireOk {
					mb.tryForceExpireCache()
				}
				return sm, sm.seq, nil
			} else if err != ErrStoreMsgNotFound {
				return nil, 0, err
			} else if expireOk {
				// No match in this block; still expire if we loaded its cache.
				mb.tryForceExpireCache()
			}
		}
	}

	return nil, fs.state.LastSeq, ErrStoreEOF

}
  6260  
// LoadNextMsg finds the next message at or after start that matches filter
// (wc indicates the filter contains wildcards). Returns the message and its
// sequence, or (nil, LastSeq, ErrStoreEOF) when nothing matches.
func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, uint64, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	if fs.closed {
		return nil, 0, ErrStoreClosed
	}
	if fs.state.Msgs == 0 {
		return nil, fs.state.LastSeq, ErrStoreEOF
	}
	// Clamp start into the stream's range.
	if start < fs.state.FirstSeq {
		start = fs.state.FirstSeq
	}

	// If start is less than or equal to beginning of our stream, meaning our first call,
	// let's check the psim to see if we can skip ahead.
	if start <= fs.state.FirstSeq {
		var ss SimpleState
		fs.numFilteredPending(filter, &ss)
		if ss.First > start {
			start = ss.First
		}
	}

	// Walk blocks forward from the one containing start.
	if bi, _ := fs.selectMsgBlockWithIndex(start); bi >= 0 {
		for i := bi; i < len(fs.blks); i++ {
			mb := fs.blks[i]
			if sm, expireOk, err := mb.firstMatching(filter, wc, start, sm); err == nil {
				if expireOk {
					mb.tryForceExpireCache()
				}
				return sm, sm.seq, nil
			} else if err != ErrStoreMsgNotFound {
				return nil, 0, err
			} else if expireOk {
				// No match in this block; still expire if we loaded its cache.
				mb.tryForceExpireCache()
			}
		}
	}

	return nil, fs.state.LastSeq, ErrStoreEOF
}
  6303  
// Type returns the type of the underlying store.
// Always FileStorage for a fileStore.
func (fs *fileStore) Type() StorageType {
	return FileStorage
}
  6308  
// Returns number of subjects in this store.
// Derived from the per-subject info map (psim).
// Lock should be held.
func (fs *fileStore) numSubjects() int {
	return fs.psim.Size()
}
  6314  
// numConsumers uses new lock.
// Consumers are guarded by cmu, so this is safe regardless of fs.mu state.
func (fs *fileStore) numConsumers() int {
	fs.cmu.RLock()
	defer fs.cmu.RUnlock()
	return len(fs.cfs)
}
  6321  
  6322  // FastState will fill in state with only the following.
  6323  // Msgs, Bytes, First and Last Sequence and Time and NumDeleted.
  6324  func (fs *fileStore) FastState(state *StreamState) {
  6325  	fs.mu.RLock()
  6326  	state.Msgs = fs.state.Msgs
  6327  	state.Bytes = fs.state.Bytes
  6328  	state.FirstSeq = fs.state.FirstSeq
  6329  	state.FirstTime = fs.state.FirstTime
  6330  	state.LastSeq = fs.state.LastSeq
  6331  	state.LastTime = fs.state.LastTime
  6332  	// Make sure to reset if being re-used.
  6333  	state.Deleted, state.NumDeleted = nil, 0
  6334  	if state.LastSeq > state.FirstSeq {
  6335  		state.NumDeleted = int((state.LastSeq - state.FirstSeq + 1) - state.Msgs)
  6336  		if state.NumDeleted < 0 {
  6337  			state.NumDeleted = 0
  6338  		}
  6339  	}
  6340  	state.Consumers = fs.numConsumers()
  6341  	state.NumSubjects = fs.numSubjects()
  6342  	fs.mu.RUnlock()
  6343  }
  6344  
// State returns the current state of the stream.
// Materializes the full sorted list of deleted sequences by merging head gaps
// between blocks with each block's delete map.
func (fs *fileStore) State() StreamState {
	fs.mu.RLock()
	state := fs.state
	state.Consumers = fs.numConsumers()
	state.NumSubjects = fs.numSubjects()
	state.Deleted = nil // make sure.

	// Only materialize the deleted list when interior gaps exist.
	if numDeleted := int((state.LastSeq - state.FirstSeq + 1) - state.Msgs); numDeleted > 0 {
		state.Deleted = make([]uint64, 0, numDeleted)
		cur := fs.state.FirstSeq

		for _, mb := range fs.blks {
			mb.mu.Lock()
			fseq := atomic.LoadUint64(&mb.first.seq)
			// Account for messages missing from the head.
			if fseq > cur {
				for seq := cur; seq < fseq; seq++ {
					state.Deleted = append(state.Deleted, seq)
				}
			}
			cur = atomic.LoadUint64(&mb.last.seq) + 1 // Expected next first.

			// Collect this block's delete map, pruning stale entries below fseq.
			mb.dmap.Range(func(seq uint64) bool {
				if seq < fseq {
					mb.dmap.Delete(seq)
				} else {
					state.Deleted = append(state.Deleted, seq)
				}
				return true
			})
			mb.mu.Unlock()
		}
	}
	fs.mu.RUnlock()

	// NOTE(review): called after releasing fs.mu — lostData presumably
	// synchronizes internally.
	state.Lost = fs.lostData()

	// Can not be guaranteed to be sorted.
	if len(state.Deleted) > 0 {
		sort.Slice(state.Deleted, func(i, j int) bool {
			return state.Deleted[i] < state.Deleted[j]
		})
		state.NumDeleted = len(state.Deleted)
	}
	return state
}
  6392  
  6393  func (fs *fileStore) Utilization() (total, reported uint64, err error) {
  6394  	fs.mu.RLock()
  6395  	defer fs.mu.RUnlock()
  6396  	for _, mb := range fs.blks {
  6397  		mb.mu.RLock()
  6398  		reported += mb.bytes
  6399  		total += mb.rbytes
  6400  		mb.mu.RUnlock()
  6401  	}
  6402  	return total, reported, nil
  6403  }
  6404  
  6405  func fileStoreMsgSize(subj string, hdr, msg []byte) uint64 {
  6406  	if len(hdr) == 0 {
  6407  		// length of the message record (4bytes) + seq(8) + ts(8) + subj_len(2) + subj + msg + hash(8)
  6408  		return uint64(22 + len(subj) + len(msg) + 8)
  6409  	}
  6410  	// length of the message record (4bytes) + seq(8) + ts(8) + subj_len(2) + subj + hdr_len(4) + hdr + msg + hash(8)
  6411  	return uint64(22 + len(subj) + 4 + len(hdr) + len(msg) + 8)
  6412  }
  6413  
  6414  func fileStoreMsgSizeEstimate(slen, maxPayload int) uint64 {
  6415  	return uint64(emptyRecordLen + slen + 4 + maxPayload)
  6416  }
  6417  
  6418  // Determine time since any last activity, read/load, write or remove.
  6419  func (mb *msgBlock) sinceLastActivity() time.Duration {
  6420  	if mb.closed {
  6421  		return 0
  6422  	}
  6423  	last := mb.lwts
  6424  	if mb.lrts > last {
  6425  		last = mb.lrts
  6426  	}
  6427  	if mb.llts > last {
  6428  		last = mb.llts
  6429  	}
  6430  	return time.Since(time.Unix(0, last).UTC())
  6431  }
  6432  
  6433  // Determine time since last write or remove of a message.
  6434  // Read lock should be held.
  6435  func (mb *msgBlock) sinceLastWriteActivity() time.Duration {
  6436  	if mb.closed {
  6437  		return 0
  6438  	}
  6439  	last := mb.lwts
  6440  	if mb.lrts > last {
  6441  		last = mb.lrts
  6442  	}
  6443  	return time.Since(time.Unix(0, last).UTC())
  6444  }
  6445  
  6446  func checkNewHeader(hdr []byte) error {
  6447  	if hdr == nil || len(hdr) < 2 || hdr[0] != magic ||
  6448  		(hdr[1] != version && hdr[1] != newVersion) {
  6449  		return errCorruptState
  6450  	}
  6451  	return nil
  6452  }
  6453  
// readIndexInfo will read in the index information for the message block.
// Parses the per-block index file: header, counts, first/last sequence and
// timestamps, checksum, and an optional delete map (avl seqset in the new
// version, delta-encoded varints in the old one).
func (mb *msgBlock) readIndexInfo() error {
	ifn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, mb.index))
	buf, err := os.ReadFile(ifn)
	if err != nil {
		return err
	}

	// Set if first time.
	if mb.liwsz == 0 {
		mb.liwsz = int64(len(buf))
	}

	// Decrypt if needed.
	if mb.aek != nil {
		buf, err = mb.aek.Open(buf[:0], mb.nonce, buf, nil)
		if err != nil {
			return err
		}
	}

	// Validate magic/version; remove an unreadable index file so it gets rebuilt.
	if err := checkNewHeader(buf); err != nil {
		defer os.Remove(ifn)
		return fmt.Errorf("bad index file")
	}

	bi := hdrLen

	// Helpers, will set bi to -1 on error.
	readSeq := func() uint64 {
		if bi < 0 {
			return 0
		}
		seq, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		// Strip the erased bit so callers see the raw sequence.
		return seq &^ ebit
	}
	readCount := readSeq
	readTimeStamp := func() int64 {
		if bi < 0 {
			return 0
		}
		ts, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return ts
	}
	mb.msgs = readCount()
	mb.bytes = readCount()
	atomic.StoreUint64(&mb.first.seq, readSeq())
	mb.first.ts = readTimeStamp()
	atomic.StoreUint64(&mb.last.seq, readSeq())
	mb.last.ts = readTimeStamp()
	dmapLen := readCount()

	// Check if this is a short write index file.
	if bi < 0 || bi+checksumSize > len(buf) {
		os.Remove(ifn)
		return fmt.Errorf("short index file")
	}

	// Check for consistency if accounting. If something is off bail and we will rebuild.
	if mb.msgs != (atomic.LoadUint64(&mb.last.seq)-atomic.LoadUint64(&mb.first.seq)+1)-dmapLen {
		os.Remove(ifn)
		return fmt.Errorf("accounting inconsistent")
	}

	// Checksum
	copy(mb.lchk[0:], buf[bi:bi+checksumSize])
	bi += checksumSize

	// Now check for presence of a delete map
	if dmapLen > 0 {
		// New version is encoded avl seqset.
		if buf[1] == newVersion {
			dmap, _, err := avl.Decode(buf[bi:])
			if err != nil {
				return fmt.Errorf("could not decode avl dmap: %v", err)
			}
			mb.dmap = *dmap
		} else {
			// This is the old version.
			// Old entries are stored as deltas from the first sequence.
			for i, fseq := 0, atomic.LoadUint64(&mb.first.seq); i < int(dmapLen); i++ {
				seq := readSeq()
				if seq == 0 {
					break
				}
				mb.dmap.Insert(seq + fseq)
			}
		}
	}

	return nil
}
  6555  
  6556  // Will return total number of cache loads.
  6557  func (fs *fileStore) cacheLoads() uint64 {
  6558  	var tl uint64
  6559  	fs.mu.RLock()
  6560  	for _, mb := range fs.blks {
  6561  		tl += mb.cloads
  6562  	}
  6563  	fs.mu.RUnlock()
  6564  	return tl
  6565  }
  6566  
  6567  // Will return total number of cached bytes.
  6568  func (fs *fileStore) cacheSize() uint64 {
  6569  	var sz uint64
  6570  	fs.mu.RLock()
  6571  	for _, mb := range fs.blks {
  6572  		mb.mu.RLock()
  6573  		if mb.cache != nil {
  6574  			sz += uint64(len(mb.cache.buf))
  6575  		}
  6576  		mb.mu.RUnlock()
  6577  	}
  6578  	fs.mu.RUnlock()
  6579  	return sz
  6580  }
  6581  
  6582  // Will return total number of dmapEntries for all msg blocks.
  6583  func (fs *fileStore) dmapEntries() int {
  6584  	var total int
  6585  	fs.mu.RLock()
  6586  	for _, mb := range fs.blks {
  6587  		total += mb.dmap.Size()
  6588  	}
  6589  	fs.mu.RUnlock()
  6590  	return total
  6591  }
  6592  
// subjectsEqual is the comparison helper for literal (non-wildcard) filters;
// see compareFn.
func subjectsEqual(a, b string) bool {
	return a == b
}
  6597  
// subjectsAll matches any pair of subjects; used for empty or full-wildcard
// filters. See compareFn.
func subjectsAll(a, b string) bool {
	return true
}
  6601  
  6602  func compareFn(subject string) func(string, string) bool {
  6603  	if subject == _EMPTY_ || subject == fwcs {
  6604  		return subjectsAll
  6605  	}
  6606  	if subjectHasWildcard(subject) {
  6607  		return subjectIsSubsetMatch
  6608  	}
  6609  	return subjectsEqual
  6610  }
  6611  
// PurgeEx will remove messages based on subject filters, sequence and number of messages to keep.
// Will return the number of purged messages.
func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint64, err error) {
	// A full wildcard (or empty) filter with no keep and no sequence is a
	// full purge; with only a sequence bound it is a compact.
	if subject == _EMPTY_ || subject == fwcs {
		if keep == 0 && sequence == 0 {
			return fs.Purge()
		}
		if sequence > 1 {
			return fs.Compact(sequence)
		}
	}

	eq, wc := compareFn(subject), subjectHasWildcard(subject)
	var firstSeqNeedsUpdate bool
	var bytes uint64

	// If we have a "keep" designation need to get full filtered state so we know how many to purge.
	var maxp uint64
	if keep > 0 {
		ss := fs.FilteredState(1, subject)
		if keep >= ss.Msgs {
			// Already holding no more than asked to keep, nothing to purge.
			return 0, nil
		}
		maxp = ss.Msgs - keep
	}

	var smv StoreMsg

	fs.mu.Lock()
	// We may remove blocks as we purge, so don't range directly on fs.blks
	// otherwise we may jump over some (see https://github.com/nats-io/nats-server/issues/3528)
	for i := 0; i < len(fs.blks); i++ {
		mb := fs.blks[i]
		mb.mu.Lock()

		// If we do not have our fss, try to expire the cache if we have no items in this block.
		shouldExpire := mb.fssNotLoaded()

		t, f, l := mb.filteredPendingLocked(subject, wc, atomic.LoadUint64(&mb.first.seq))
		if t == 0 {
			// Expire if we were responsible for loading.
			if shouldExpire {
				// Expire this cache before moving on.
				mb.tryForceExpireCacheLocked()
			}
			mb.mu.Unlock()
			continue
		}

		// Honor the sequence bound: only messages strictly below it may go.
		if sequence > 1 && sequence <= l {
			l = sequence - 1
		}

		if mb.cacheNotLoaded() {
			mb.loadMsgsWithLock()
			shouldExpire = true
		}

		for seq := f; seq <= l; seq++ {
			if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && eq(sm.subj, subject) {
				rl := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
				// Do fast in place remove.
				// Stats
				if mb.msgs > 0 {
					// Msgs
					fs.state.Msgs--
					mb.msgs--
					// Bytes, make sure to not go negative.
					if rl > fs.state.Bytes {
						rl = fs.state.Bytes
					}
					if rl > mb.bytes {
						rl = mb.bytes
					}
					fs.state.Bytes -= rl
					mb.bytes -= rl
					// Totals
					purged++
					bytes += rl
				}
				// FSS updates.
				mb.removeSeqPerSubject(sm.subj, seq)
				fs.removePerSubject(sm.subj)

				// Check for first message.
				if seq == atomic.LoadUint64(&mb.first.seq) {
					mb.selectNextFirst()
					if mb.isEmpty() {
						// Block is now empty; remove it and back up the index
						// since fs.blks shifted down by one.
						fs.removeMsgBlock(mb)
						i--
						// keep flag set, if set previously
						firstSeqNeedsUpdate = firstSeqNeedsUpdate || seq == fs.state.FirstSeq
					} else if seq == fs.state.FirstSeq {
						fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) // new one.
						fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
					}
				} else {
					// Out of order delete.
					mb.dmap.Insert(seq)
				}

				if maxp > 0 && purged >= maxp {
					break
				}
			}
		}
		// Expire if we were responsible for loading.
		if shouldExpire {
			// Expire this cache before moving on.
			mb.tryForceExpireCacheLocked()
		}
		mb.mu.Unlock()

		// Check if we should break out of top level too.
		if maxp > 0 && purged >= maxp {
			break
		}
	}
	if firstSeqNeedsUpdate {
		fs.selectNextFirst()
	}

	fs.dirty++
	cb := fs.scb
	fs.mu.Unlock()

	// Notify any storage update callback outside the lock.
	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return purged, nil
}
  6744  
// Purge will remove all messages from this store.
// Will return the number of purged messages.
func (fs *fileStore) Purge() (uint64, error) {
	// fseq of 0 means do not reset the first sequence to a specific value.
	return fs.purge(0)
}
  6750  
// purge removes all messages from the store and, when fseq is beyond the
// resulting first sequence, resets the stream to start at fseq.
// Returns the number of purged messages.
func (fs *fileStore) purge(fseq uint64) (uint64, error) {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return 0, ErrStoreClosed
	}

	purged := fs.state.Msgs
	rbytes := int64(fs.state.Bytes)

	// First sequence moves past everything we currently hold.
	fs.state.FirstSeq = fs.state.LastSeq + 1
	fs.state.FirstTime = time.Time{}

	fs.state.Bytes = 0
	fs.state.Msgs = 0

	// Close all blocks without flushing; files are removed below via rename.
	for _, mb := range fs.blks {
		mb.dirtyClose()
	}

	fs.blks = nil
	fs.lmb = nil
	fs.bim = make(map[uint32]*msgBlock)
	// Clear any per subject tracking.
	fs.psim, fs.tsl = fs.psim.Empty(), 0
	// Mark dirty
	fs.dirty++

	// Move the msgs directory out of the way, will delete out of band.
	// FIXME(dlc) - These can error and we need to change api above to propagate?
	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	// If purge directory still exists then we need to wait
	// in place and remove since rename would fail.
	if _, err := os.Stat(pdir); err == nil {
		<-dios
		os.RemoveAll(pdir)
		dios <- struct{}{}
	}

	// Rename is fast; the actual deletion happens in the goroutine below.
	<-dios
	os.Rename(mdir, pdir)
	dios <- struct{}{}

	go func() {
		<-dios
		os.RemoveAll(pdir)
		dios <- struct{}{}
	}()

	// Create new one.
	<-dios
	os.MkdirAll(mdir, defaultDirPerms)
	dios <- struct{}{}

	// Make sure we have a lmb to write to.
	if _, err := fs.newMsgBlockForWrite(); err != nil {
		fs.mu.Unlock()
		return purged, err
	}

	// Check if we need to set the first seq to a new number.
	if fseq > fs.state.FirstSeq {
		fs.state.FirstSeq = fseq
		fs.state.LastSeq = fseq - 1
	}

	lmb := fs.lmb
	atomic.StoreUint64(&lmb.first.seq, fs.state.FirstSeq)
	atomic.StoreUint64(&lmb.last.seq, fs.state.LastSeq)
	lmb.last.ts = fs.state.LastTime.UnixNano()

	if lseq := atomic.LoadUint64(&lmb.last.seq); lseq > 1 {
		// Leave a tombstone so we can remember our starting sequence in case
		// full state becomes corrupted.
		lmb.writeTombstone(lseq, lmb.last.ts)
	}

	cb := fs.scb
	fs.mu.Unlock()

	// Notify any storage update callback outside the lock.
	if cb != nil {
		cb(-int64(purged), -rbytes, 0, _EMPTY_)
	}

	return purged, nil
}
  6838  
  6839  // Compact will remove all messages from this store up to
  6840  // but not including the seq parameter.
  6841  // Will return the number of purged messages.
  6842  func (fs *fileStore) Compact(seq uint64) (uint64, error) {
  6843  	if seq == 0 {
  6844  		return fs.purge(seq)
  6845  	}
  6846  
  6847  	var purged, bytes uint64
  6848  
  6849  	fs.mu.Lock()
  6850  	// Same as purge all.
  6851  	if lseq := fs.state.LastSeq; seq > lseq {
  6852  		fs.mu.Unlock()
  6853  		return fs.purge(seq)
  6854  	}
  6855  	// We have to delete interior messages.
  6856  	smb := fs.selectMsgBlock(seq)
  6857  	if smb == nil {
  6858  		fs.mu.Unlock()
  6859  		return 0, nil
  6860  	}
  6861  
  6862  	// All msgblocks up to this one can be thrown away.
  6863  	var deleted int
  6864  	for _, mb := range fs.blks {
  6865  		if mb == smb {
  6866  			break
  6867  		}
  6868  		mb.mu.Lock()
  6869  		purged += mb.msgs
  6870  		bytes += mb.bytes
  6871  		// Make sure we do subject cleanup as well.
  6872  		mb.ensurePerSubjectInfoLoaded()
  6873  		for subj, ss := range mb.fss {
  6874  			for i := uint64(0); i < ss.Msgs; i++ {
  6875  				fs.removePerSubject(subj)
  6876  			}
  6877  		}
  6878  		// Now close.
  6879  		mb.dirtyCloseWithRemove(true)
  6880  		mb.mu.Unlock()
  6881  		deleted++
  6882  	}
  6883  
  6884  	var smv StoreMsg
  6885  	var err error
  6886  
  6887  	smb.mu.Lock()
  6888  	if atomic.LoadUint64(&smb.first.seq) == seq {
  6889  		fs.state.FirstSeq = atomic.LoadUint64(&smb.first.seq)
  6890  		fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC()
  6891  		goto SKIP
  6892  	}
  6893  
  6894  	// Make sure we have the messages loaded.
  6895  	if smb.cacheNotLoaded() {
  6896  		if err = smb.loadMsgsWithLock(); err != nil {
  6897  			goto SKIP
  6898  		}
  6899  	}
  6900  	for mseq := atomic.LoadUint64(&smb.first.seq); mseq < seq; mseq++ {
  6901  		sm, err := smb.cacheLookup(mseq, &smv)
  6902  		if err == errDeletedMsg {
  6903  			// Update dmap.
  6904  			if !smb.dmap.IsEmpty() {
  6905  				smb.dmap.Delete(seq)
  6906  			}
  6907  		} else if sm != nil {
  6908  			sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
  6909  			if smb.msgs > 0 {
  6910  				smb.msgs--
  6911  				if sz > smb.bytes {
  6912  					sz = smb.bytes
  6913  				}
  6914  				smb.bytes -= sz
  6915  				bytes += sz
  6916  				purged++
  6917  			}
  6918  			// Update fss
  6919  			smb.removeSeqPerSubject(sm.subj, mseq)
  6920  			fs.removePerSubject(sm.subj)
  6921  		}
  6922  	}
  6923  
  6924  	// Check if empty after processing, could happen if tail of messages are all deleted.
  6925  	if isEmpty := smb.msgs == 0; isEmpty {
  6926  		// Only remove if not the last block.
  6927  		if smb != fs.lmb {
  6928  			smb.dirtyCloseWithRemove(true)
  6929  			deleted++
  6930  		}
  6931  		// Update fs first here as well.
  6932  		fs.state.FirstSeq = atomic.LoadUint64(&smb.last.seq) + 1
  6933  		fs.state.FirstTime = time.Time{}
  6934  
  6935  	} else {
  6936  		// Make sure to sync changes.
  6937  		smb.needSync = true
  6938  		// Update fs first seq and time.
  6939  		atomic.StoreUint64(&smb.first.seq, seq-1) // Just for start condition for selectNextFirst.
  6940  		smb.selectNextFirst()
  6941  
  6942  		fs.state.FirstSeq = atomic.LoadUint64(&smb.first.seq)
  6943  		fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC()
  6944  
  6945  		// Check if we should reclaim the head space from this block.
  6946  		// This will be optimistic only, so don't continue if we encounter any errors here.
  6947  		if smb.rbytes > compactMinimum && smb.bytes*2 < smb.rbytes {
  6948  			var moff uint32
  6949  			moff, _, _, err = smb.slotInfo(int(atomic.LoadUint64(&smb.first.seq) - smb.cache.fseq))
  6950  			if err != nil || moff >= uint32(len(smb.cache.buf)) {
  6951  				goto SKIP
  6952  			}
  6953  			buf := smb.cache.buf[moff:]
  6954  			// Don't reuse, copy to new recycled buf.
  6955  			nbuf := getMsgBlockBuf(len(buf))
  6956  			nbuf = append(nbuf, buf...)
  6957  			smb.closeFDsLockedNoCheck()
  6958  			// Check for encryption.
  6959  			if smb.bek != nil && len(nbuf) > 0 {
  6960  				// Recreate to reset counter.
  6961  				bek, err := genBlockEncryptionKey(smb.fs.fcfg.Cipher, smb.seed, smb.nonce)
  6962  				if err != nil {
  6963  					goto SKIP
  6964  				}
  6965  				// For future writes make sure to set smb.bek to keep counter correct.
  6966  				smb.bek = bek
  6967  				smb.bek.XORKeyStream(nbuf, nbuf)
  6968  			}
  6969  			// Recompress if necessary (smb.cmp contains the algorithm used when
  6970  			// the block was loaded from disk, or defaults to NoCompression if not)
  6971  			if nbuf, err = smb.cmp.Compress(nbuf); err != nil {
  6972  				goto SKIP
  6973  			}
  6974  			<-dios
  6975  			err = os.WriteFile(smb.mfn, nbuf, defaultFilePerms)
  6976  			dios <- struct{}{}
  6977  			if err != nil {
  6978  				goto SKIP
  6979  			}
  6980  			// Make sure to remove fss state.
  6981  			smb.fss = nil
  6982  			smb.clearCacheAndOffset()
  6983  			smb.rbytes = uint64(len(nbuf))
  6984  		}
  6985  	}
  6986  
  6987  SKIP:
  6988  	smb.mu.Unlock()
  6989  
  6990  	if deleted > 0 {
  6991  		// Update block map.
  6992  		if fs.bim != nil {
  6993  			for _, mb := range fs.blks[:deleted] {
  6994  				delete(fs.bim, mb.index)
  6995  			}
  6996  		}
  6997  		// Update blks slice.
  6998  		fs.blks = copyMsgBlocks(fs.blks[deleted:])
  6999  		if lb := len(fs.blks); lb == 0 {
  7000  			fs.lmb = nil
  7001  		} else {
  7002  			fs.lmb = fs.blks[lb-1]
  7003  		}
  7004  	}
  7005  
  7006  	// Update top level accounting.
  7007  	if purged > fs.state.Msgs {
  7008  		purged = fs.state.Msgs
  7009  	}
  7010  	fs.state.Msgs -= purged
  7011  
  7012  	if bytes > fs.state.Bytes {
  7013  		bytes = fs.state.Bytes
  7014  	}
  7015  	fs.state.Bytes -= bytes
  7016  
  7017  	fs.dirty++
  7018  
  7019  	cb := fs.scb
  7020  	fs.mu.Unlock()
  7021  
  7022  	if cb != nil && purged > 0 {
  7023  		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
  7024  	}
  7025  
  7026  	return purged, err
  7027  }
  7028  
// Will completely reset our store.
func (fs *fileStore) reset() error {
	fs.mu.Lock()
	if fs.closed {
		fs.mu.Unlock()
		return ErrStoreClosed
	}
	// Can not reset while a snapshot is in progress.
	if fs.sips > 0 {
		fs.mu.Unlock()
		return ErrStoreSnapshotInProgress
	}

	var purged, bytes uint64
	cb := fs.scb

	// Remove all blocks and their underlying files, tallying what we drop
	// so it can be reported to the storage callback below.
	for _, mb := range fs.blks {
		mb.mu.Lock()
		purged += mb.msgs
		bytes += mb.bytes
		mb.dirtyCloseWithRemove(true)
		mb.mu.Unlock()
	}

	// Reset
	fs.state.FirstSeq = 0
	fs.state.FirstTime = time.Time{}
	fs.state.LastSeq = 0
	fs.state.LastTime = time.Now().UTC()
	// Update msgs and bytes.
	fs.state.Msgs = 0
	fs.state.Bytes = 0

	// Reset blocks.
	fs.blks, fs.lmb = nil, nil

	// Reset subject mappings.
	fs.psim, fs.tsl = fs.psim.Empty(), 0
	fs.bim = make(map[uint32]*msgBlock)

	// If we purged anything, make sure we kick flush state loop.
	if purged > 0 {
		fs.dirty++
	}

	fs.mu.Unlock()

	// Report removals to any storage update callback outside the lock.
	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return nil
}
  7081  
// Truncate will truncate a stream store up to seq. Sequence needs to be valid.
func (fs *fileStore) Truncate(seq uint64) error {
	// Check for request to reset.
	if seq == 0 {
		return fs.reset()
	}

	fs.mu.Lock()

	if fs.closed {
		fs.mu.Unlock()
		return ErrStoreClosed
	}
	// Can not truncate while a snapshot is in progress.
	if fs.sips > 0 {
		fs.mu.Unlock()
		return ErrStoreSnapshotInProgress
	}

	// The block holding seq becomes the new last block; seq itself must exist.
	nlmb := fs.selectMsgBlock(seq)
	if nlmb == nil {
		fs.mu.Unlock()
		return ErrInvalidSequence
	}
	lsm, _, _ := nlmb.fetchMsg(seq, nil)
	if lsm == nil {
		fs.mu.Unlock()
		return ErrInvalidSequence
	}

	// Set lmb to nlmb and make sure writeable.
	fs.lmb = nlmb
	if err := nlmb.enableForWriting(fs.fip); err != nil {
		fs.mu.Unlock()
		return err
	}

	var purged, bytes uint64

	// Truncate our new last message block.
	nmsgs, nbytes, err := nlmb.truncate(lsm)
	if err != nil {
		fs.mu.Unlock()
		return fmt.Errorf("nlmb.truncate: %w", err)
	}
	// Account for the truncated msgs and bytes.
	purged += nmsgs
	bytes += nbytes

	// Remove any left over msg blocks.
	getLastMsgBlock := func() *msgBlock { return fs.blks[len(fs.blks)-1] }
	for mb := getLastMsgBlock(); mb != nlmb; mb = getLastMsgBlock() {
		mb.mu.Lock()
		purged += mb.msgs
		bytes += mb.bytes
		fs.removeMsgBlock(mb)
		mb.mu.Unlock()
	}

	// Reset last.
	fs.state.LastSeq = lsm.seq
	fs.state.LastTime = time.Unix(0, lsm.ts).UTC()
	// Update msgs and bytes.
	if purged > fs.state.Msgs {
		purged = fs.state.Msgs
	}
	fs.state.Msgs -= purged
	if bytes > fs.state.Bytes {
		bytes = fs.state.Bytes
	}
	fs.state.Bytes -= bytes

	// Reset our subject lookup info.
	fs.resetGlobalPerSubjectInfo()

	fs.dirty++

	cb := fs.scb
	fs.mu.Unlock()

	// Report removals to any storage update callback outside the lock.
	if cb != nil {
		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
	}

	return nil
}
  7167  
  7168  func (fs *fileStore) lastSeq() uint64 {
  7169  	fs.mu.RLock()
  7170  	seq := fs.state.LastSeq
  7171  	fs.mu.RUnlock()
  7172  	return seq
  7173  }
  7174  
  7175  // Returns number of msg blks.
  7176  func (fs *fileStore) numMsgBlocks() int {
  7177  	fs.mu.RLock()
  7178  	defer fs.mu.RUnlock()
  7179  	return len(fs.blks)
  7180  }
  7181  
  7182  // Will add a new msgBlock.
  7183  // Lock should be held.
  7184  func (fs *fileStore) addMsgBlock(mb *msgBlock) {
  7185  	fs.blks = append(fs.blks, mb)
  7186  	fs.lmb = mb
  7187  	fs.bim[mb.index] = mb
  7188  }
  7189  
  7190  // Remove from our list of blks.
  7191  // Both locks should be held.
  7192  func (fs *fileStore) removeMsgBlockFromList(mb *msgBlock) {
  7193  	// Remove from list.
  7194  	for i, omb := range fs.blks {
  7195  		if mb == omb {
  7196  			fs.dirty++
  7197  			blks := append(fs.blks[:i], fs.blks[i+1:]...)
  7198  			fs.blks = copyMsgBlocks(blks)
  7199  			if fs.bim != nil {
  7200  				delete(fs.bim, mb.index)
  7201  			}
  7202  			break
  7203  		}
  7204  	}
  7205  }
  7206  
// Removes the msgBlock
// Both locks should be held.
func (fs *fileStore) removeMsgBlock(mb *msgBlock) {
	mb.dirtyCloseWithRemove(true)
	fs.removeMsgBlockFromList(mb)
	// Check for us being last message block
	if mb == fs.lmb {
		// Capture last seq/ts before temporarily dropping the block lock.
		lseq, lts := atomic.LoadUint64(&mb.last.seq), mb.last.ts
		// Creating a new message write block requires that the lmb lock is not held.
		mb.mu.Unlock()
		// Write the tombstone to remember since this was last block.
		if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil {
			lmb.writeTombstone(lseq, lts)
		}
		// Re-acquire so we return with the same lock state we were called with.
		mb.mu.Lock()
	}
}
  7224  
  7225  // Called by purge to simply get rid of the cache and close our fds.
  7226  // Lock should not be held.
  7227  func (mb *msgBlock) dirtyClose() {
  7228  	mb.mu.Lock()
  7229  	defer mb.mu.Unlock()
  7230  	mb.dirtyCloseWithRemove(false)
  7231  }
  7232  
// dirtyCloseWithRemove tears down the block without flushing. If remove is
// true the on-disk message and key files are deleted as well.
// Should be called with lock held.
func (mb *msgBlock) dirtyCloseWithRemove(remove bool) {
	if mb == nil {
		return
	}
	// Stop cache expiration timer.
	if mb.ctmr != nil {
		mb.ctmr.Stop()
		mb.ctmr = nil
	}
	// Clear any tracking by subject.
	mb.fss = nil
	// Close cache
	mb.clearCacheAndOffset()
	// Quit our loops.
	if mb.qch != nil {
		close(mb.qch)
		mb.qch = nil
	}
	if mb.mfd != nil {
		mb.mfd.Close()
		mb.mfd = nil
	}
	if remove {
		// Remove the message file and, if present, the encryption key file.
		if mb.mfn != _EMPTY_ {
			os.Remove(mb.mfn)
			mb.mfn = _EMPTY_
		}
		if mb.kfn != _EMPTY_ {
			os.Remove(mb.kfn)
		}
	}
}
  7266  
// Remove a seq from the fss and select new first.
// Lock should be held.
func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) {
	// Make sure per-subject info is loaded; an error here simply results in
	// a nil lookup below and we return.
	mb.ensurePerSubjectInfoLoaded()
	ss := mb.fss[subj]
	if ss == nil {
		return
	}

	// Last message for this subject in the block, drop the entry entirely.
	if ss.Msgs == 1 {
		delete(mb.fss, subj)
		return
	}

	ss.Msgs--

	// Only one left.
	if ss.Msgs == 1 {
		// The survivor is whichever boundary was not removed.
		if seq == ss.Last {
			ss.Last = ss.First
		} else {
			ss.First = ss.Last
		}
		ss.firstNeedsUpdate = false
		return
	}

	// We can lazily calculate the first sequence when needed.
	ss.firstNeedsUpdate = seq == ss.First || ss.firstNeedsUpdate
}
  7297  
// Will recalulate the first sequence for this subject in this block.
// Will avoid slower path message lookups and scan the cache directly instead.
// Lock should be held (loadMsgsWithLock is used below).
func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *SimpleState) {
	// Need to make sure messages are loaded.
	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return
		}
	}

	// Mark first as updated.
	ss.firstNeedsUpdate = false

	// Translate the starting sequence into a cache index slot.
	startSlot := int(startSeq - mb.cache.fseq)
	if startSlot >= len(mb.cache.idx) {
		// Start is beyond this block's index; fall back to the last seq.
		ss.First = ss.Last
		return
	} else if startSlot < 0 {
		startSlot = 0
	}

	var le = binary.LittleEndian
	for slot, fseq := startSlot, atomic.LoadUint64(&mb.first.seq); slot < len(mb.cache.idx); slot++ {
		bi := mb.cache.idx[slot] &^ hbit
		if bi == dbit {
			// delete marker so skip.
			continue
		}
		li := int(bi) - mb.cache.off
		if li >= len(mb.cache.buf) {
			ss.First = ss.Last
			return
		}
		// Decode the record header to extract subject length and sequence.
		buf := mb.cache.buf[li:]
		hdr := buf[:msgHdrSize]
		slen := int(le.Uint16(hdr[20:]))
		if subj == bytesToString(buf[msgHdrSize:msgHdrSize+slen]) {
			seq := le.Uint64(hdr[4:])
			// Skip anything below the block's first seq, erased records
			// (ebit set), or sequences present in the delete map.
			if seq < fseq || seq&ebit != 0 || mb.dmap.Exists(seq) {
				continue
			}
			ss.First = seq
			return
		}
	}
}
  7344  
  7345  // Lock should be held.
  7346  func (fs *fileStore) resetGlobalPerSubjectInfo() {
  7347  	// Clear any global subject state.
  7348  	fs.psim, fs.tsl = fs.psim.Empty(), 0
  7349  	for _, mb := range fs.blks {
  7350  		fs.populateGlobalPerSubjectInfo(mb)
  7351  	}
  7352  }
  7353  
// resetPerSubjectInfo discards this block's per-subject state and rebuilds
// it from the raw message block contents.
// Lock should be held.
func (mb *msgBlock) resetPerSubjectInfo() error {
	mb.fss = nil
	return mb.generatePerSubjectInfo()
}
  7359  
// generatePerSubjectInfo will generate the per subject info via the raw msg block.
// Lock should be held.
func (mb *msgBlock) generatePerSubjectInfo() error {
	// Check if this mb is empty. This can happen when its the last one and we are holding onto it for seq and timestamp info.
	if mb.msgs == 0 {
		return nil
	}

	if mb.cacheNotLoaded() {
		if err := mb.loadMsgsWithLock(); err != nil {
			return err
		}
		// indexCacheBuf can produce fss now, so if non-nil we are good.
		if mb.fss != nil {
			return nil
		}
	}

	// Create new one regardless.
	mb.fss = make(map[string]*SimpleState)

	var smv StoreMsg
	fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
	for seq := fseq; seq <= lseq; seq++ {
		sm, err := mb.cacheLookup(seq, &smv)
		if err != nil {
			// Since we are walking by sequence we can ignore some errors that are benign to rebuilding our state.
			if err == ErrStoreMsgNotFound || err == errDeletedMsg {
				continue
			}
			if err == errNoCache {
				return nil
			}
			return err
		}
		if sm != nil && len(sm.subj) > 0 {
			// Extend an existing subject entry or start a new one.
			if ss := mb.fss[sm.subj]; ss != nil {
				ss.Msgs++
				ss.Last = seq
			} else {
				mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
			}
		}
	}

	if len(mb.fss) > 0 {
		// Make sure we run the cache expire timer.
		mb.llts = time.Now().UnixNano()
		mb.startCacheExpireTimer()
	}
	return nil
}
  7412  
  7413  // Helper to make sure fss loaded if we are tracking.
  7414  // Lock should be held
  7415  func (mb *msgBlock) ensurePerSubjectInfoLoaded() error {
  7416  	if mb.fss != nil || mb.noTrack {
  7417  		return nil
  7418  	}
  7419  	if mb.msgs == 0 {
  7420  		mb.fss = make(map[string]*SimpleState)
  7421  		return nil
  7422  	}
  7423  	return mb.generatePerSubjectInfo()
  7424  }
  7425  
// Called on recovery to populate the global psim state.
// Lock should be held.
func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) {
	mb.mu.Lock()
	defer mb.mu.Unlock()

	// If we can not load the block's per-subject info there is nothing to add.
	if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
		return
	}

	// Now populate psim.
	for subj, ss := range mb.fss {
		if len(subj) > 0 {
			bsubj := stringToBytes(subj)
			if info, ok := fs.psim.Find(bsubj); ok {
				// Subject already known; bump totals and track the highest
				// block index containing it.
				info.total += ss.Msgs
				if mb.index > info.lblk {
					info.lblk = mb.index
				}
			} else {
				// First time we see this subject; account its length in tsl.
				fs.psim.Insert(bsubj, psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index})
				fs.tsl += len(subj)
			}
		}
	}
}
  7452  
// Close the message block.
// If sync is true the underlying file is synced to disk before closing.
func (mb *msgBlock) close(sync bool) {
	if mb == nil {
		return
	}
	mb.mu.Lock()
	defer mb.mu.Unlock()

	// Nothing to do if already closed.
	if mb.closed {
		return
	}

	// Stop cache expiration timer.
	if mb.ctmr != nil {
		mb.ctmr.Stop()
		mb.ctmr = nil
	}

	// Clear fss.
	mb.fss = nil

	// Close cache
	mb.clearCacheAndOffset()
	// Quit our loops.
	if mb.qch != nil {
		close(mb.qch)
		mb.qch = nil
	}
	if mb.mfd != nil {
		// Optionally flush to disk before closing the descriptor.
		if sync {
			mb.mfd.Sync()
		}
		mb.mfd.Close()
	}
	mb.mfd = nil
	// Mark as closed.
	mb.closed = true
}
  7491  
  7492  func (fs *fileStore) closeAllMsgBlocks(sync bool) {
  7493  	for _, mb := range fs.blks {
  7494  		mb.close(sync)
  7495  	}
  7496  }
  7497  
// Delete removes the entire store from disk. Returns ErrStoreClosed if the
// store was already closed, after still attempting directory cleanup.
func (fs *fileStore) Delete() error {
	if fs.isClosed() {
		// Always attempt to remove since we could have been closed beforehand.
		os.RemoveAll(fs.fcfg.StoreDir)
		// Since we did remove, if we did have anything remaining make sure to
		// call into any storage updates that had been registered.
		fs.mu.Lock()
		cb, msgs, bytes := fs.scb, int64(fs.state.Msgs), int64(fs.state.Bytes)
		// Guard against double accounting if called twice.
		fs.state.Msgs, fs.state.Bytes = 0, 0
		fs.mu.Unlock()
		if msgs > 0 && cb != nil {
			cb(-msgs, -bytes, 0, _EMPTY_)
		}
		return ErrStoreClosed
	}

	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
	// If purge directory still exists then we need to wait
	// in place and remove since rename would fail.
	if _, err := os.Stat(pdir); err == nil {
		os.RemoveAll(pdir)
	}

	// Do Purge() since if we have lots of blocks uses a mv/rename.
	fs.Purge()

	if err := fs.stop(false); err != nil {
		return err
	}

	// Make sure we will not try to recover if killed before removal below completes.
	if err := os.Remove(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)); err != nil {
		return err
	}
	// Now move into different directory with "." prefix.
	ndir := filepath.Join(filepath.Dir(fs.fcfg.StoreDir), tsep+filepath.Base(fs.fcfg.StoreDir))
	if err := os.Rename(fs.fcfg.StoreDir, ndir); err != nil {
		return err
	}
	// Do this in separate Go routine in case lots of blocks.
	// Purge above protects us as does the removal of meta artifacts above.
	go func() {
		err := os.RemoveAll(ndir)
		if err == nil {
			return
		}
		// Retry for up to a second in case something still holds files open.
		ttl := time.Now().Add(time.Second)
		for time.Now().Before(ttl) {
			time.Sleep(10 * time.Millisecond)
			if err = os.RemoveAll(ndir); err == nil {
				return
			}
		}
	}()

	return nil
}
  7556  
  7557  // Lock should be held.
  7558  func (fs *fileStore) setSyncTimer() {
  7559  	if fs.syncTmr != nil {
  7560  		fs.syncTmr.Reset(fs.fcfg.SyncInterval)
  7561  	} else {
  7562  		fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks)
  7563  	}
  7564  }
  7565  
  7566  // Lock should be held.
  7567  func (fs *fileStore) cancelSyncTimer() {
  7568  	if fs.syncTmr != nil {
  7569  		fs.syncTmr.Stop()
  7570  		fs.syncTmr = nil
  7571  	}
  7572  }
  7573  
const (
	// Magic byte identifying the on-disk full stream state file format.
	fullStateMagic   = uint8(11)
	// Encoding version of the full stream state file.
	fullStateVersion = uint8(1)
)
  7578  
  7579  // This go routine runs and receives kicks to write out our full stream state index.
  7580  // This will get kicked when we create a new block or when we delete a block in general.
  7581  // This is also called during Stop().
  7582  func (fs *fileStore) flushStreamStateLoop(qch, done chan struct{}) {
  7583  	// Signal we are done on exit.
  7584  	defer close(done)
  7585  
  7586  	// Make sure we do not try to write these out too fast.
  7587  	const writeThreshold = 2 * time.Minute
  7588  	t := time.NewTicker(writeThreshold)
  7589  	defer t.Stop()
  7590  
  7591  	for {
  7592  		select {
  7593  		case <-t.C:
  7594  			fs.writeFullState()
  7595  		case <-qch:
  7596  			return
  7597  		}
  7598  	}
  7599  }
  7600  
  7601  // Helper since unixnano of zero time undefined.
  7602  func timestampNormalized(t time.Time) int64 {
  7603  	if t.IsZero() {
  7604  		return 0
  7605  	}
  7606  	return t.UnixNano()
  7607  }
  7608  
// writeFullState will proceed to write the full meta state iff not complex and time consuming.
// Since this is for quick recovery it is optional and should not block/stall normal operations.
func (fs *fileStore) writeFullState() error {
	// Non-forced: _writeFullState may bail with errStateTooBig.
	return fs._writeFullState(false)
}
  7614  
// forceWriteFullState will proceed to write the full meta state. This should only be called by stop()
func (fs *fileStore) forceWriteFullState() error {
	// Forced: skips the complexity/size guard used by writeFullState.
	return fs._writeFullState(true)
}
  7619  
  7620  // This will write the full binary state for the stream.
  7621  // This plus everything new since last hash will be the total recovered state.
  7622  // This state dump will have the following.
  7623  // 1. Stream summary - Msgs, Bytes, First and Last (Sequence and Timestamp)
  7624  // 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present.
  7625  // 3. MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset).
  7626  // 4. Last block index and hash of record inclusive to this stream state.
  7627  func (fs *fileStore) _writeFullState(force bool) error {
  7628  	fs.mu.Lock()
  7629  	if fs.closed || fs.dirty == 0 {
  7630  		fs.mu.Unlock()
  7631  		return nil
  7632  	}
  7633  
  7634  	// For calculating size and checking time costs for non forced calls.
  7635  	numSubjects := fs.numSubjects()
  7636  
  7637  	// If we are not being forced to write out our state, check the complexity for time costs as to not
  7638  	// block or stall normal operations.
  7639  	// We will base off of number of subjects and interior deletes. A very large number of msg blocks could also
  7640  	// be used, but for next server version will redo all meta handling to be disk based. So this is temporary.
  7641  	if !force {
  7642  		const numThreshold = 1_000_000
  7643  		// Calculate interior deletes.
  7644  		var numDeleted int
  7645  		if fs.state.LastSeq > fs.state.FirstSeq {
  7646  			numDeleted = int((fs.state.LastSeq - fs.state.FirstSeq + 1) - fs.state.Msgs)
  7647  		}
  7648  		if numSubjects > numThreshold || numDeleted > numThreshold {
  7649  			fs.mu.Unlock()
  7650  			return errStateTooBig
  7651  		}
  7652  	}
  7653  
  7654  	// We track this through subsequent runs to get an avg per blk used for subsequent runs.
  7655  	avgDmapLen := fs.adml
  7656  	// If first time through could be 0
  7657  	if avgDmapLen == 0 && ((fs.state.LastSeq-fs.state.FirstSeq+1)-fs.state.Msgs) > 0 {
  7658  		avgDmapLen = 1024
  7659  	}
  7660  
  7661  	// Calculate and estimate of the uper bound on the  size to avoid multiple allocations.
  7662  	sz := hdrLen + // Magic and Version
  7663  		(binary.MaxVarintLen64 * 6) + // FS data
  7664  		binary.MaxVarintLen64 + fs.tsl + // NumSubjects + total subject length
  7665  		numSubjects*(binary.MaxVarintLen64*4) + // psi record
  7666  		binary.MaxVarintLen64 + // Num blocks.
  7667  		len(fs.blks)*((binary.MaxVarintLen64*7)+avgDmapLen) + // msg blocks, avgDmapLen is est for dmaps
  7668  		binary.MaxVarintLen64 + 8 + 8 // last index + record checksum + full state checksum
  7669  
  7670  	// Do 4k on stack if possible.
  7671  	const ssz = 4 * 1024
  7672  	var buf []byte
  7673  
  7674  	if sz <= ssz {
  7675  		var _buf [ssz]byte
  7676  		buf, sz = _buf[0:hdrLen:ssz], ssz
  7677  	} else {
  7678  		buf = make([]byte, hdrLen, sz)
  7679  	}
  7680  
  7681  	buf[0], buf[1] = fullStateMagic, fullStateVersion
  7682  	buf = binary.AppendUvarint(buf, fs.state.Msgs)
  7683  	buf = binary.AppendUvarint(buf, fs.state.Bytes)
  7684  	buf = binary.AppendUvarint(buf, fs.state.FirstSeq)
  7685  	buf = binary.AppendVarint(buf, timestampNormalized(fs.state.FirstTime))
  7686  	buf = binary.AppendUvarint(buf, fs.state.LastSeq)
  7687  	buf = binary.AppendVarint(buf, timestampNormalized(fs.state.LastTime))
  7688  
  7689  	// Do per subject information map if applicable.
  7690  	buf = binary.AppendUvarint(buf, uint64(numSubjects))
  7691  	if numSubjects > 0 {
  7692  		fs.psim.Match([]byte(fwcs), func(subj []byte, psi *psi) {
  7693  			buf = binary.AppendUvarint(buf, uint64(len(subj)))
  7694  			buf = append(buf, subj...)
  7695  			buf = binary.AppendUvarint(buf, psi.total)
  7696  			buf = binary.AppendUvarint(buf, uint64(psi.fblk))
  7697  			if psi.total > 1 {
  7698  				buf = binary.AppendUvarint(buf, uint64(psi.lblk))
  7699  			}
  7700  		})
  7701  	}
  7702  
  7703  	// Now walk all blocks and write out first and last and optional dmap encoding.
  7704  	var lbi uint32
  7705  	var lchk [8]byte
  7706  
  7707  	nb := len(fs.blks)
  7708  	buf = binary.AppendUvarint(buf, uint64(nb))
  7709  
  7710  	// Use basetime to save some space.
  7711  	baseTime := timestampNormalized(fs.state.FirstTime)
  7712  	var scratch [8 * 1024]byte
  7713  
  7714  	// Track the state as represented by the mbs.
  7715  	var mstate StreamState
  7716  
  7717  	var dmapTotalLen int
  7718  	for _, mb := range fs.blks {
  7719  		mb.mu.RLock()
  7720  		buf = binary.AppendUvarint(buf, uint64(mb.index))
  7721  		buf = binary.AppendUvarint(buf, mb.bytes)
  7722  		buf = binary.AppendUvarint(buf, atomic.LoadUint64(&mb.first.seq))
  7723  		buf = binary.AppendVarint(buf, mb.first.ts-baseTime)
  7724  		buf = binary.AppendUvarint(buf, atomic.LoadUint64(&mb.last.seq))
  7725  		buf = binary.AppendVarint(buf, mb.last.ts-baseTime)
  7726  
  7727  		numDeleted := mb.dmap.Size()
  7728  		buf = binary.AppendUvarint(buf, uint64(numDeleted))
  7729  		if numDeleted > 0 {
  7730  			dmap, _ := mb.dmap.Encode(scratch[:0])
  7731  			dmapTotalLen += len(dmap)
  7732  			buf = append(buf, dmap...)
  7733  		}
  7734  		// If this is the last one grab the last checksum and the block index, e.g. 22.blk, 22 is the block index.
  7735  		// We use this to quickly open this file on recovery.
  7736  		if mb == fs.lmb {
  7737  			lbi = mb.index
  7738  			mb.ensureLastChecksumLoaded()
  7739  			copy(lchk[0:], mb.lchk[:])
  7740  		}
  7741  		updateTrackingState(&mstate, mb)
  7742  		mb.mu.RUnlock()
  7743  	}
  7744  	if dmapTotalLen > 0 {
  7745  		fs.adml = dmapTotalLen / len(fs.blks)
  7746  	}
  7747  
  7748  	// Place block index and hash onto the end.
  7749  	buf = binary.AppendUvarint(buf, uint64(lbi))
  7750  	buf = append(buf, lchk[:]...)
  7751  
  7752  	// Encrypt if needed.
  7753  	if fs.prf != nil {
  7754  		if err := fs.setupAEK(); err != nil {
  7755  			fs.mu.Unlock()
  7756  			return err
  7757  		}
  7758  		nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead())
  7759  		if n, err := rand.Read(nonce); err != nil {
  7760  			return err
  7761  		} else if n != len(nonce) {
  7762  			return fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce))
  7763  		}
  7764  		buf = fs.aek.Seal(nonce, nonce, buf, nil)
  7765  	}
  7766  
  7767  	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
  7768  
  7769  	fs.hh.Reset()
  7770  	fs.hh.Write(buf)
  7771  	buf = fs.hh.Sum(buf)
  7772  
  7773  	// Snapshot prior dirty count.
  7774  	priorDirty := fs.dirty
  7775  
  7776  	statesEqual := trackingStatesEqual(&fs.state, &mstate) || len(fs.blks) > 0
  7777  	// Release lock.
  7778  	fs.mu.Unlock()
  7779  
  7780  	// Check consistency here.
  7781  	if !statesEqual {
  7782  		fs.warn("Stream state encountered internal inconsistency on write")
  7783  		// Rebuild our fs state from the mb state.
  7784  		fs.rebuildState(nil)
  7785  		return errCorruptState
  7786  	}
  7787  
  7788  	if cap(buf) > sz {
  7789  		fs.debug("WriteFullState reallocated from %d to %d", sz, cap(buf))
  7790  	}
  7791  
  7792  	// Write to a tmp file and rename.
  7793  	const tmpPre = streamStreamStateFile + tsep
  7794  	f, err := os.CreateTemp(filepath.Join(fs.fcfg.StoreDir, msgDir), tmpPre)
  7795  	if err != nil {
  7796  		return err
  7797  	}
  7798  	tmpName := f.Name()
  7799  	defer os.Remove(tmpName)
  7800  	if _, err = f.Write(buf); err == nil && fs.fcfg.SyncAlways {
  7801  		f.Sync()
  7802  	}
  7803  	f.Close()
  7804  	if err != nil {
  7805  		return err
  7806  	}
  7807  
  7808  	// Rename into position under our lock, clear prior dirty pending on success.
  7809  	fs.mu.Lock()
  7810  	if !fs.closed {
  7811  		if err := os.Rename(tmpName, fn); err != nil {
  7812  			fs.mu.Unlock()
  7813  			return err
  7814  		}
  7815  		fs.dirty -= priorDirty
  7816  	}
  7817  	fs.mu.Unlock()
  7818  
  7819  	return nil
  7820  }
  7821  
  7822  // Stop the current filestore.
  7823  func (fs *fileStore) Stop() error {
  7824  	return fs.stop(true)
  7825  }
  7826  
// stop shuts down the current filestore, optionally flushing blocks and
// persisting the full state first. Returns ErrStoreClosed if the store is
// already closed or a stop is in flight.
func (fs *fileStore) stop(writeState bool) error {
	fs.mu.Lock()
	if fs.closed || fs.closing {
		fs.mu.Unlock()
		return ErrStoreClosed
	}

	// Mark as closing. Do before releasing the lock to writeFullState
	// so we don't end up with this function running more than once.
	fs.closing = true

	if writeState {
		fs.checkAndFlushAllBlocks()
	}
	fs.closeAllMsgBlocks(false)

	fs.cancelSyncTimer()
	fs.cancelAgeChk()

	// Release the state flusher loop.
	if fs.qch != nil {
		close(fs.qch)
		fs.qch = nil
	}

	if writeState {
		// Wait for the state flush loop to exit.
		fsld := fs.fsld
		fs.mu.Unlock()
		<-fsld
		// Write full state if needed. If not dirty this is a no-op.
		fs.forceWriteFullState()
		fs.mu.Lock()
	}

	// Mark as closed. Last message block needs to be cleared after
	// writeFullState has completed.
	fs.closed = true
	fs.lmb = nil

	// We should update the upper usage layer on a stop.
	cb, bytes := fs.scb, int64(fs.state.Bytes)
	fs.mu.Unlock()

	// Snapshot and clear the consumer stores under cmu, then stop them
	// outside of any locks.
	fs.cmu.Lock()
	var _cfs [256]ConsumerStore
	cfs := append(_cfs[:0], fs.cfs...)
	fs.cfs = nil
	fs.cmu.Unlock()

	for _, o := range cfs {
		o.Stop()
	}

	// Report released bytes to the usage callback if one is registered.
	if bytes > 0 && cb != nil {
		cb(0, -bytes, 0, _EMPTY_)
	}

	return nil
}
  7888  
// errFile is the tar entry name used to report errors encountered while streaming a snapshot.
const errFile = "errors.txt"
  7890  
// Stream our snapshot through S2 compression and tar.
// Errors are reported in-band by writing an errors.txt entry into the tar
// stream; the caller consumes everything through w. The snapshot-in-progress
// counter (fs.sips) is decremented on exit.
func (fs *fileStore) streamSnapshot(w io.WriteCloser, includeConsumers bool) {
	defer w.Close()

	// Wrap the writer with S2 compression, then tar inside that.
	enc := s2.NewWriter(w)
	defer enc.Close()

	tw := tar.NewWriter(enc)
	defer tw.Close()

	// Clear snapshot-in-progress marker on exit.
	defer func() {
		fs.mu.Lock()
		fs.sips--
		fs.mu.Unlock()
	}()

	modTime := time.Now().UTC()

	// writeFile adds one entry with the given name and contents to the tar.
	writeFile := func(name string, buf []byte) error {
		hdr := &tar.Header{
			Name:    name,
			Mode:    0600,
			ModTime: modTime,
			Uname:   "nats",
			Gname:   "nats",
			Size:    int64(len(buf)),
			Format:  tar.FormatPAX,
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return err
		}
		if _, err := tw.Write(buf); err != nil {
			return err
		}
		return nil
	}

	// writeErr records an error message into the snapshot as errors.txt.
	writeErr := func(err string) {
		writeFile(errFile, []byte(err))
	}

	fs.mu.Lock()
	blks := fs.blks
	// Grab our general meta data.
	// We do this now instead of pulling from files since they could be encrypted.
	meta, err := json.Marshal(fs.cfg)
	if err != nil {
		fs.mu.Unlock()
		writeErr(fmt.Sprintf("Could not gather stream meta file: %v", err))
		return
	}
	hh := fs.hh
	hh.Reset()
	hh.Write(meta)
	sum := []byte(hex.EncodeToString(fs.hh.Sum(nil)))
	fs.mu.Unlock()

	// Meta first.
	if writeFile(JetStreamMetaFile, meta) != nil {
		return
	}
	if writeFile(JetStreamMetaFileSum, sum) != nil {
		return
	}

	// Can't use join path here, tar only recognizes relative paths with forward slashes.
	msgPre := msgDir + "/"
	var bbuf []byte

	// Include the full stream state file if present and plausibly valid.
	const minLen = 32
	sfn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
	if buf, err := os.ReadFile(sfn); err == nil && len(buf) >= minLen {
		if fs.aek != nil {
			// Decrypt so the snapshot carries plaintext state.
			ns := fs.aek.NonceSize()
			buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:len(buf)-highwayhash.Size64], nil)
			if err == nil {
				// Redo hash checksum at end on plaintext.
				fs.mu.Lock()
				hh.Reset()
				hh.Write(buf)
				buf = fs.hh.Sum(buf)
				fs.mu.Unlock()
			}
		}
		if err == nil && writeFile(msgPre+streamStreamStateFile, buf) != nil {
			return
		}
	}

	// Now do messages themselves.
	for _, mb := range blks {
		if mb.pendingWriteSize() > 0 {
			mb.flushPendingMsgs()
		}
		mb.mu.Lock()
		// We could stream but don't want to hold the lock and prevent changes, so just read in and
		// release the lock for now.
		bbuf, err = mb.loadBlock(bbuf)
		if err != nil {
			mb.mu.Unlock()
			writeErr(fmt.Sprintf("Could not read message block [%d]: %v", mb.index, err))
			return
		}
		// Check for encryption.
		if mb.bek != nil && len(bbuf) > 0 {
			rbek, err := genBlockEncryptionKey(fs.fcfg.Cipher, mb.seed, mb.nonce)
			if err != nil {
				mb.mu.Unlock()
				writeErr(fmt.Sprintf("Could not create encryption key for message block [%d]: %v", mb.index, err))
				return
			}
			rbek.XORKeyStream(bbuf, bbuf)
		}
		// Check for compression.
		if bbuf, err = mb.decompressIfNeeded(bbuf); err != nil {
			mb.mu.Unlock()
			writeErr(fmt.Sprintf("Could not decompress message block [%d]: %v", mb.index, err))
			return
		}
		mb.mu.Unlock()

		// Do this one unlocked.
		if writeFile(msgPre+fmt.Sprintf(blkScan, mb.index), bbuf) != nil {
			return
		}
	}

	// Bail if no consumers requested.
	if !includeConsumers {
		return
	}

	// Do consumers' state last.
	fs.cmu.RLock()
	cfs := fs.cfs
	fs.cmu.RUnlock()

	for _, cs := range cfs {
		// Only file backed consumers are included.
		o, ok := cs.(*consumerFileStore)
		if !ok {
			continue
		}
		o.mu.Lock()
		// Grab our general meta data.
		// We do this now instead of pulling from files since they could be encrypted.
		meta, err := json.Marshal(o.cfg)
		if err != nil {
			o.mu.Unlock()
			writeErr(fmt.Sprintf("Could not gather consumer meta file for %q: %v", o.name, err))
			return
		}
		o.hh.Reset()
		o.hh.Write(meta)
		sum := []byte(hex.EncodeToString(o.hh.Sum(nil)))

		// We can have the running state directly encoded now.
		state, err := o.encodeState()
		if err != nil {
			o.mu.Unlock()
			writeErr(fmt.Sprintf("Could not encode consumer state for %q: %v", o.name, err))
			return
		}
		odirPre := filepath.Join(consumerDir, o.name)
		o.mu.Unlock()

		// Write all the consumer files.
		if writeFile(filepath.Join(odirPre, JetStreamMetaFile), meta) != nil {
			return
		}
		if writeFile(filepath.Join(odirPre, JetStreamMetaFileSum), sum) != nil {
			return
		}
		writeFile(filepath.Join(odirPre, consumerState), state)
	}
}
  8066  
  8067  // Create a snapshot of this stream and its consumer's state along with messages.
  8068  func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumers bool) (*SnapshotResult, error) {
  8069  	fs.mu.Lock()
  8070  	if fs.closed {
  8071  		fs.mu.Unlock()
  8072  		return nil, ErrStoreClosed
  8073  	}
  8074  	// Only allow one at a time.
  8075  	if fs.sips > 0 {
  8076  		fs.mu.Unlock()
  8077  		return nil, ErrStoreSnapshotInProgress
  8078  	}
  8079  	// Mark us as snapshotting
  8080  	fs.sips += 1
  8081  	fs.mu.Unlock()
  8082  
  8083  	if checkMsgs {
  8084  		ld := fs.checkMsgs()
  8085  		if ld != nil && len(ld.Msgs) > 0 {
  8086  			return nil, fmt.Errorf("snapshot check detected %d bad messages", len(ld.Msgs))
  8087  		}
  8088  	}
  8089  
  8090  	// Write out full state as well before proceeding.
  8091  	fs.writeFullState()
  8092  
  8093  	pr, pw := net.Pipe()
  8094  
  8095  	// Set a write deadline here to protect ourselves.
  8096  	if deadline > 0 {
  8097  		pw.SetWriteDeadline(time.Now().Add(deadline))
  8098  	}
  8099  
  8100  	// We can add to our stream while snapshotting but not "user" delete anything.
  8101  	var state StreamState
  8102  	fs.FastState(&state)
  8103  
  8104  	// Stream in separate Go routine.
  8105  	go fs.streamSnapshot(pw, includeConsumers)
  8106  
  8107  	return &SnapshotResult{pr, state}, nil
  8108  }
  8109  
  8110  // Helper to return the config.
  8111  func (fs *fileStore) fileStoreConfig() FileStoreConfig {
  8112  	fs.mu.RLock()
  8113  	defer fs.mu.RUnlock()
  8114  	return fs.fcfg
  8115  }
  8116  
  8117  // Read lock all existing message blocks.
  8118  // Lock held on entry.
  8119  func (fs *fileStore) readLockAllMsgBlocks() {
  8120  	for _, mb := range fs.blks {
  8121  		mb.mu.RLock()
  8122  	}
  8123  }
  8124  
  8125  // Read unlock all existing message blocks.
  8126  // Lock held on entry.
  8127  func (fs *fileStore) readUnlockAllMsgBlocks() {
  8128  	for _, mb := range fs.blks {
  8129  		mb.mu.RUnlock()
  8130  	}
  8131  }
  8132  
// EncodedStreamState returns a binary encoded state snapshot, >= v2.10 server.
// Layout: magic/version header, then uvarints for Msgs, Bytes, FirstSeq,
// LastSeq, Failed and NumDeleted, followed by optional delete block encodings.
func (fs *fileStore) EncodedStreamState(failed uint64) ([]byte, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()

	// Calculate deleted.
	var numDeleted int64
	if fs.state.LastSeq > fs.state.FirstSeq {
		numDeleted = int64(fs.state.LastSeq-fs.state.FirstSeq+1) - int64(fs.state.Msgs)
		if numDeleted < 0 {
			numDeleted = 0
		}
	}

	// Encoded is Msgs, Bytes, FirstSeq, LastSeq, Failed, NumDeleted and optional DeletedBlocks
	var buf [1024]byte
	buf[0], buf[1] = streamStateMagic, streamStateVersion
	n := hdrLen
	n += binary.PutUvarint(buf[n:], fs.state.Msgs)
	n += binary.PutUvarint(buf[n:], fs.state.Bytes)
	n += binary.PutUvarint(buf[n:], fs.state.FirstSeq)
	n += binary.PutUvarint(buf[n:], fs.state.LastSeq)
	n += binary.PutUvarint(buf[n:], failed)
	n += binary.PutUvarint(buf[n:], uint64(numDeleted))

	b := buf[0:n]

	// Only append delete encodings when there is something deleted.
	if numDeleted > 0 {
		var scratch [4 * 1024]byte

		// Blocks must be read locked while gathering their delete state.
		fs.readLockAllMsgBlocks()
		defer fs.readUnlockAllMsgBlocks()

		for _, db := range fs.deleteBlocks() {
			switch db := db.(type) {
			case *DeleteRange:
				// Ranges are run-length encoded: magic, first, num.
				first, _, num := db.State()
				scratch[0] = runLengthMagic
				i := 1
				i += binary.PutUvarint(scratch[i:], first)
				i += binary.PutUvarint(scratch[i:], num)
				b = append(b, scratch[0:i]...)
			case *avl.SequenceSet:
				// Sequence sets use their own binary encoding.
				buf, err := db.Encode(scratch[:0])
				if err != nil {
					return nil, err
				}
				b = append(b, buf...)
			default:
				return nil, errors.New("no impl")
			}
		}
	}

	return b, nil
}
  8189  
  8190  // We used to be more sophisticated to save memory, but speed is more important.
  8191  // All blocks should be at least read locked.
  8192  func (fs *fileStore) deleteBlocks() DeleteBlocks {
  8193  	var dbs DeleteBlocks
  8194  	var prevLast uint64
  8195  
  8196  	for _, mb := range fs.blks {
  8197  		// Detect if we have a gap between these blocks.
  8198  		fseq := atomic.LoadUint64(&mb.first.seq)
  8199  		if prevLast > 0 && prevLast+1 != fseq {
  8200  			dbs = append(dbs, &DeleteRange{First: prevLast + 1, Num: fseq - prevLast - 1})
  8201  		}
  8202  		if mb.dmap.Size() > 0 {
  8203  			dbs = append(dbs, &mb.dmap)
  8204  		}
  8205  		prevLast = atomic.LoadUint64(&mb.last.seq)
  8206  	}
  8207  	return dbs
  8208  }
  8209  
// SyncDeleted will make sure this stream has same deleted state as dbs.
// Delete blocks that do not match our local delete state are replayed via
// removeMsg for each sequence they contain.
func (fs *fileStore) SyncDeleted(dbs DeleteBlocks) {
	if len(dbs) == 0 {
		return
	}

	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Delete blocks we could not match against our local state.
	var needsCheck DeleteBlocks

	// Compare incoming delete blocks position-by-position against ours.
	fs.readLockAllMsgBlocks()
	mdbs := fs.deleteBlocks()
	for i, db := range dbs {
		// If the block is same as what we have we can skip.
		if i < len(mdbs) {
			first, last, num := db.State()
			eFirst, eLast, eNum := mdbs[i].State()
			if first == eFirst && last == eLast && num == eNum {
				continue
			}
		}
		// Need to insert these.
		needsCheck = append(needsCheck, db)
	}
	fs.readUnlockAllMsgBlocks()

	// Apply every sequence from the unmatched blocks.
	for _, db := range needsCheck {
		db.Range(func(dseq uint64) bool {
			fs.removeMsg(dseq, false, true, false)
			return true
		})
	}
}
  8244  
  8245  ////////////////////////////////////////////////////////////////////////////////
  8246  // Consumers
  8247  ////////////////////////////////////////////////////////////////////////////////
  8248  
// consumerFileStore is the file backed ConsumerStore implementation.
type consumerFileStore struct {
	mu      sync.Mutex
	fs      *fileStore        // Parent stream file store.
	cfg     *FileConsumerInfo // Consumer config plus creation info.
	prf     keyGen            // Key generation function when encrypted.
	aek     cipher.AEAD       // AEAD cipher for state encryption.
	name    string            // Consumer name.
	odir    string            // Consumer directory.
	ifn     string            // Path to the consumer state file.
	hh      hash.Hash64       // Hash for meta file checksums.
	state   ConsumerState     // In-memory consumer state.
	fch     chan struct{}     // Kicks the flusher goroutine.
	qch     chan struct{}     // Signals the flusher goroutine to quit.
	flusher bool              // Whether the flush loop is running.
	writing bool              // Whether a state write is in progress.
	dirty   bool              // Whether state has changed since last write.
	closed  bool
}
  8267  
// ConsumerStore creates or recovers the file backed store for a consumer of
// this stream. If cfg.MemoryStorage is set a memory backed store is returned
// instead. Handles encryption key recovery and cipher conversion as needed.
func (fs *fileStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerStore, error) {
	if fs == nil {
		return nil, fmt.Errorf("filestore is nil")
	}
	if fs.isClosed() {
		return nil, ErrStoreClosed
	}
	if cfg == nil || name == _EMPTY_ {
		return nil, fmt.Errorf("bad consumer config")
	}

	// We now allow overrides from a stream being a filestore type and forcing a consumer to be memory store.
	if cfg.MemoryStorage {
		// Create directly here.
		o := &consumerMemStore{ms: fs, cfg: *cfg}
		fs.AddConsumer(o)
		return o, nil
	}

	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, name)
	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
		return nil, fmt.Errorf("could not create consumer directory - %v", err)
	}
	csi := &FileConsumerInfo{Name: name, Created: time.Now().UTC(), ConsumerConfig: *cfg}
	o := &consumerFileStore{
		fs:   fs,
		cfg:  csi,
		prf:  fs.prf,
		name: name,
		odir: odir,
		ifn:  filepath.Join(odir, consumerState),
	}
	// Per-consumer hash keyed off of stream and consumer names.
	key := sha256.Sum256([]byte(fs.cfg.Name + "/" + name))
	hh, err := highwayhash.New64(key[:])
	if err != nil {
		return nil, fmt.Errorf("could not create hash: %v", err)
	}
	o.hh = hh

	// Check for encryption.
	if o.prf != nil {
		if ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey)); err == nil {
			if len(ekey) < minBlkKeySize {
				return nil, errBadKeySize
			}
			// Recover key encryption key.
			rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
			if err != nil {
				return nil, err
			}

			sc := fs.fcfg.Cipher
			kek, err := genEncryptionKey(sc, rb)
			if err != nil {
				return nil, err
			}
			ns := kek.NonceSize()
			nonce := ekey[:ns]
			seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
			if err != nil {
				// We may be here on a cipher conversion, so attempt to convert.
				if err = o.convertCipher(); err != nil {
					return nil, err
				}
			} else {
				o.aek, err = genEncryptionKey(sc, seed)
			}
			if err != nil {
				return nil, err
			}
		}
	}

	// Track if we are creating the directory so that we can clean up if we encounter an error.
	var didCreate bool

	// Write our meta data iff does not exist.
	meta := filepath.Join(odir, JetStreamMetaFile)
	if _, err := os.Stat(meta); err != nil && os.IsNotExist(err) {
		didCreate = true
		csi.Created = time.Now().UTC()
		if err := o.writeConsumerMeta(); err != nil {
			os.RemoveAll(odir)
			return nil, err
		}
	}

	// If we expect to be encrypted check that what we are restoring is not plaintext.
	// This can happen on snapshot restores or conversions.
	if o.prf != nil {
		keyFile := filepath.Join(odir, JetStreamMetaFileKey)
		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
			if err := o.writeConsumerMeta(); err != nil {
				if didCreate {
					os.RemoveAll(odir)
				}
				return nil, err
			}
			// Redo the state file as well here if we have one and we can tell it was plaintext.
			if buf, err := os.ReadFile(o.ifn); err == nil {
				if _, err := decodeConsumerState(buf); err == nil {
					state, err := o.encryptState(buf)
					if err != nil {
						return nil, err
					}
					// Gate the disk write through the dios channel.
					<-dios
					err = os.WriteFile(o.ifn, state, defaultFilePerms)
					dios <- struct{}{}
					if err != nil {
						if didCreate {
							os.RemoveAll(odir)
						}
						return nil, err
					}
				}
			}
		}
	}

	// Create channels to control our flush go routine.
	o.fch = make(chan struct{}, 1)
	o.qch = make(chan struct{})
	go o.flushLoop(o.fch, o.qch)

	// Make sure to load in our state from disk if needed.
	o.loadState()

	// Assign to filestore.
	fs.AddConsumer(o)

	return o, nil
}
  8400  
// convertCipher attempts to recover consumer state written with the other
// supported cipher (AES <-> ChaCha) and rewrite the meta keys and state with
// the currently configured one. Called when opening the key file with the
// configured cipher fails.
func (o *consumerFileStore) convertCipher() error {
	fs := o.fs
	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, o.name)

	ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey))
	if err != nil {
		return err
	}
	if len(ekey) < minBlkKeySize {
		return errBadKeySize
	}
	// Recover key encryption key.
	rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
	if err != nil {
		return err
	}

	// Do these in reverse since converting.
	sc := fs.fcfg.Cipher
	osc := AES
	if sc == AES {
		osc = ChaCha
	}
	kek, err := genEncryptionKey(osc, rb)
	if err != nil {
		return err
	}
	ns := kek.NonceSize()
	nonce := ekey[:ns]
	seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
	if err != nil {
		return err
	}
	aek, err := genEncryptionKey(osc, seed)
	if err != nil {
		return err
	}
	// Now read in and decode our state using the old cipher.
	buf, err := os.ReadFile(o.ifn)
	if err != nil {
		return err
	}
	buf, err = aek.Open(nil, buf[:ns], buf[ns:], nil)
	if err != nil {
		return err
	}

	// Since we are here we recovered our old state.
	// Now write our meta, which will generate the new keys with the new cipher.
	if err := o.writeConsumerMeta(); err != nil {
		return err
	}

	// Now write out our state with the new cipher.
	return o.writeState(buf)
}
  8457  
  8458  // Kick flusher for this consumer.
  8459  // Lock should be held.
  8460  func (o *consumerFileStore) kickFlusher() {
  8461  	if o.fch != nil {
  8462  		select {
  8463  		case o.fch <- struct{}{}:
  8464  		default:
  8465  		}
  8466  	}
  8467  	o.dirty = true
  8468  }
  8469  
  8470  // Set in flusher status
  8471  func (o *consumerFileStore) setInFlusher() {
  8472  	o.mu.Lock()
  8473  	o.flusher = true
  8474  	o.mu.Unlock()
  8475  }
  8476  
  8477  // Clear in flusher status
  8478  func (o *consumerFileStore) clearInFlusher() {
  8479  	o.mu.Lock()
  8480  	o.flusher = false
  8481  	o.mu.Unlock()
  8482  }
  8483  
  8484  // Report in flusher status
  8485  func (o *consumerFileStore) inFlusher() bool {
  8486  	o.mu.Lock()
  8487  	defer o.mu.Unlock()
  8488  	return o.flusher
  8489  }
  8490  
// flushLoop watches for consumer updates and the quit channel.
// Writes are rate limited so a busy consumer persists state roughly 10 times
// per second; each kick encodes the current state and writes it to disk.
func (o *consumerFileStore) flushLoop(fch, qch chan struct{}) {

	o.setInFlusher()
	defer o.clearInFlusher()

	// Maintain approximately 10 updates per second per consumer under load.
	const minTime = 100 * time.Millisecond
	var lastWrite time.Time
	var dt *time.Timer

	// setDelayTimer (re)arms the shared timer, draining a stale fire first so
	// the subsequent receive does not pick up an old tick.
	setDelayTimer := func(addWait time.Duration) {
		if dt == nil {
			dt = time.NewTimer(addWait)
			return
		}
		if !dt.Stop() {
			select {
			case <-dt.C:
			default:
			}
		}
		dt.Reset(addWait)
	}

	for {
		select {
		case <-fch:
			// Too soon since the last write, delay until the window passes.
			if ts := time.Since(lastWrite); ts < minTime {
				setDelayTimer(minTime - ts)
				select {
				case <-dt.C:
				case <-qch:
					return
				}
			}
			o.mu.Lock()
			if o.closed {
				o.mu.Unlock()
				return
			}
			buf, err := o.encodeState()
			o.mu.Unlock()
			if err != nil {
				return
			}
			// TODO(dlc) - if we error should start failing upwards.
			if err := o.writeState(buf); err == nil {
				lastWrite = time.Now()
			}
		case <-qch:
			return
		}
	}
}
  8546  
  8547  // SetStarting sets our starting stream sequence.
  8548  func (o *consumerFileStore) SetStarting(sseq uint64) error {
  8549  	o.mu.Lock()
  8550  	o.state.Delivered.Stream = sseq
  8551  	buf, err := o.encodeState()
  8552  	o.mu.Unlock()
  8553  	if err != nil {
  8554  		return err
  8555  	}
  8556  	return o.writeState(buf)
  8557  }
  8558  
  8559  // HasState returns if this store has a recorded state.
  8560  func (o *consumerFileStore) HasState() bool {
  8561  	o.mu.Lock()
  8562  	_, err := os.Stat(o.ifn)
  8563  	o.mu.Unlock()
  8564  	return err == nil
  8565  }
  8566  
// UpdateDelivered is called whenever a new message has been delivered.
// dseq/sseq are consumer and stream sequences, dc the delivery count and ts
// the delivery timestamp. Updates pending/redelivered tracking per the ack
// policy and kicks the flusher to persist.
func (o *consumerFileStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	// Redeliveries make no sense without an ack policy.
	if dc != 1 && o.cfg.AckPolicy == AckNone {
		return ErrNoAckPolicy
	}

	// On restarts the old leader may get a replay from the raft logs that are old.
	if dseq <= o.state.AckFloor.Consumer {
		return nil
	}

	// See if we expect an ack for this.
	if o.cfg.AckPolicy != AckNone {
		// Need to create pending records here.
		if o.state.Pending == nil {
			o.state.Pending = make(map[uint64]*Pending)
		}
		var p *Pending
		// Check for an update to a message already delivered.
		if sseq <= o.state.Delivered.Stream {
			if p = o.state.Pending[sseq]; p != nil {
				// Redelivery: refresh the consumer seq and timestamp.
				p.Sequence, p.Timestamp = dseq, ts
			}
		} else {
			// Add to pending.
			o.state.Pending[sseq] = &Pending{dseq, ts}
		}
		// Update delivered as needed.
		if dseq > o.state.Delivered.Consumer {
			o.state.Delivered.Consumer = dseq
		}
		if sseq > o.state.Delivered.Stream {
			o.state.Delivered.Stream = sseq
		}

		// Track redelivery counts.
		if dc > 1 {
			// Past max deliver means it will never be acked, drop from pending.
			if maxdc := uint64(o.cfg.MaxDeliver); maxdc > 0 && dc > maxdc {
				// Make sure to remove from pending.
				delete(o.state.Pending, sseq)
			}
			if o.state.Redelivered == nil {
				o.state.Redelivered = make(map[uint64]uint64)
			}
			// Only update if greater then what we already have.
			if o.state.Redelivered[sseq] < dc-1 {
				o.state.Redelivered[sseq] = dc - 1
			}
		}
	} else {
		// For AckNone just update delivered and ackfloor at the same time.
		if dseq > o.state.Delivered.Consumer {
			o.state.Delivered.Consumer = dseq
			o.state.AckFloor.Consumer = dseq
		}
		if sseq > o.state.Delivered.Stream {
			o.state.Delivered.Stream = sseq
			o.state.AckFloor.Stream = sseq
		}
	}
	// Make sure we flush to disk.
	o.kickFlusher()

	return nil
}
  8634  
// UpdateAcks is called whenever a consumer with explicit ack or ack all acks a message.
// dseq/sseq are the consumer/stream sequences being acked.
func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error {
	o.mu.Lock()
	defer o.mu.Unlock()

	if o.cfg.AckPolicy == AckNone {
		return ErrNoAckPolicy
	}

	// On restarts the old leader may get a replay from the raft logs that are old.
	if dseq <= o.state.AckFloor.Consumer {
		return nil
	}

	if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
		return ErrStoreMsgNotFound
	}

	// Check for AckAll here.
	if o.cfg.AckPolicy == AckAll {
		// AckAll acknowledges everything up to and including sseq, so clear
		// all pending/redelivered entries in (old floor, sseq] and advance
		// the floor to the acked position.
		sgap := sseq - o.state.AckFloor.Stream
		o.state.AckFloor.Consumer = dseq
		o.state.AckFloor.Stream = sseq
		for seq := sseq; seq > sseq-sgap; seq-- {
			delete(o.state.Pending, seq)
			if len(o.state.Redelivered) > 0 {
				delete(o.state.Redelivered, seq)
			}
		}
		o.kickFlusher()
		return nil
	}

	// AckExplicit

	// First delete from our pending state.
	if p, ok := o.state.Pending[sseq]; ok {
		delete(o.state.Pending, sseq)
		dseq = p.Sequence // Use the original.
	}
	if len(o.state.Pending) == 0 {
		// Nothing outstanding anymore: the floor catches up to delivered.
		o.state.AckFloor.Consumer = o.state.Delivered.Consumer
		o.state.AckFloor.Stream = o.state.Delivered.Stream
	} else if dseq == o.state.AckFloor.Consumer+1 {
		// This ack is contiguous with the floor, so move the floor up.
		o.state.AckFloor.Consumer = dseq
		o.state.AckFloor.Stream = sseq

		if o.state.Delivered.Consumer > dseq {
			// Scan forward to just below the next still-pending message and
			// advance the floor to there.
			for ss := sseq + 1; ss <= o.state.Delivered.Stream; ss++ {
				if p, ok := o.state.Pending[ss]; ok {
					if p.Sequence > 0 {
						o.state.AckFloor.Consumer = p.Sequence - 1
						o.state.AckFloor.Stream = ss - 1
					}
					break
				}
			}
		}
	}
	// We do these regardless.
	delete(o.state.Redelivered, sseq)

	o.kickFlusher()
	return nil
}
  8700  
// Maximum header size for encoded consumer state: six varint-encoded
// sequence values plus the magic/version header.
const seqsHdrSize = 6*binary.MaxVarintLen64 + hdrLen

// EncodedState returns our consumer state encoded in the version 2 format.
// It acquires the store lock; callers already holding the lock should call
// encodeState directly.
func (o *consumerFileStore) EncodedState() ([]byte, error) {
	o.mu.Lock()
	defer o.mu.Unlock()
	return o.encodeState()
}
  8711  
  8712  func (o *consumerFileStore) encodeState() ([]byte, error) {
  8713  	// Grab reference to state, but make sure we load in if needed, so do not reference o.state directly.
  8714  	state, err := o.stateWithCopyLocked(false)
  8715  	if err != nil {
  8716  		return nil, err
  8717  	}
  8718  	return encodeConsumerState(state), nil
  8719  }
  8720  
  8721  func (o *consumerFileStore) UpdateConfig(cfg *ConsumerConfig) error {
  8722  	o.mu.Lock()
  8723  	defer o.mu.Unlock()
  8724  
  8725  	// This is mostly unchecked here. We are assuming the upper layers have done sanity checking.
  8726  	csi := o.cfg
  8727  	csi.ConsumerConfig = *cfg
  8728  
  8729  	return o.writeConsumerMeta()
  8730  }
  8731  
  8732  func (o *consumerFileStore) Update(state *ConsumerState) error {
  8733  	o.mu.Lock()
  8734  	defer o.mu.Unlock()
  8735  
  8736  	// Check to see if this is an outdated update.
  8737  	if state.Delivered.Consumer < o.state.Delivered.Consumer || state.AckFloor.Stream < o.state.AckFloor.Stream {
  8738  		return nil
  8739  	}
  8740  
  8741  	// Sanity checks.
  8742  	if state.AckFloor.Consumer > state.Delivered.Consumer {
  8743  		return fmt.Errorf("bad ack floor for consumer")
  8744  	}
  8745  	if state.AckFloor.Stream > state.Delivered.Stream {
  8746  		return fmt.Errorf("bad ack floor for stream")
  8747  	}
  8748  
  8749  	// Copy to our state.
  8750  	var pending map[uint64]*Pending
  8751  	var redelivered map[uint64]uint64
  8752  	if len(state.Pending) > 0 {
  8753  		pending = make(map[uint64]*Pending, len(state.Pending))
  8754  		for seq, p := range state.Pending {
  8755  			pending[seq] = &Pending{p.Sequence, p.Timestamp}
  8756  			if seq <= state.AckFloor.Stream || seq > state.Delivered.Stream {
  8757  				return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq)
  8758  			}
  8759  		}
  8760  	}
  8761  	if len(state.Redelivered) > 0 {
  8762  		redelivered = make(map[uint64]uint64, len(state.Redelivered))
  8763  		for seq, dc := range state.Redelivered {
  8764  			redelivered[seq] = dc
  8765  		}
  8766  	}
  8767  
  8768  	o.state.Delivered = state.Delivered
  8769  	o.state.AckFloor = state.AckFloor
  8770  	o.state.Pending = pending
  8771  	o.state.Redelivered = redelivered
  8772  
  8773  	o.kickFlusher()
  8774  
  8775  	return nil
  8776  }
  8777  
  8778  // Will encrypt the state with our asset key. Will be a no-op if encryption not enabled.
  8779  // Lock should be held.
  8780  func (o *consumerFileStore) encryptState(buf []byte) ([]byte, error) {
  8781  	if o.aek == nil {
  8782  		return buf, nil
  8783  	}
  8784  	// TODO(dlc) - Optimize on space usage a bit?
  8785  	nonce := make([]byte, o.aek.NonceSize(), o.aek.NonceSize()+len(buf)+o.aek.Overhead())
  8786  	if n, err := rand.Read(nonce); err != nil {
  8787  		return nil, err
  8788  	} else if n != len(nonce) {
  8789  		return nil, fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce))
  8790  	}
  8791  	return o.aek.Seal(nonce, nonce, buf, nil), nil
  8792  }
  8793  
// Used to limit number of disk IO calls in flight since they could all be blocking an OS thread.
// https://github.com/nats-io/nats-server/issues/2742
// Each token in the channel represents one available IO slot: receive to
// acquire, send to release.
var dios chan struct{}

// Used to setup our simplistic counting semaphore using buffered channels.
// golang.org's semaphore seemed a bit heavy.
func init() {
	// Limit ourselves to a max of 4 blocking IO calls.
	const nIO = 4
	dios = make(chan struct{}, nIO)
	// Fill it up to start, so all slots begin available.
	for i := 0; i < nIO; i++ {
		dios <- struct{}{}
	}
}
  8809  
  8810  func (o *consumerFileStore) writeState(buf []byte) error {
  8811  	// Check if we have the index file open.
  8812  	o.mu.Lock()
  8813  	if o.writing || len(buf) == 0 {
  8814  		o.mu.Unlock()
  8815  		return nil
  8816  	}
  8817  
  8818  	// Check on encryption.
  8819  	if o.aek != nil {
  8820  		var err error
  8821  		if buf, err = o.encryptState(buf); err != nil {
  8822  			return err
  8823  		}
  8824  	}
  8825  
  8826  	o.writing = true
  8827  	o.dirty = false
  8828  	ifn := o.ifn
  8829  	o.mu.Unlock()
  8830  
  8831  	// Lock not held here but we do limit number of outstanding calls that could block OS threads.
  8832  	<-dios
  8833  	err := os.WriteFile(ifn, buf, defaultFilePerms)
  8834  	dios <- struct{}{}
  8835  
  8836  	o.mu.Lock()
  8837  	if err != nil {
  8838  		o.dirty = true
  8839  	}
  8840  	o.writing = false
  8841  	o.mu.Unlock()
  8842  
  8843  	return err
  8844  }
  8845  
  8846  // Will upodate the config. Only used when recovering ephemerals.
  8847  func (o *consumerFileStore) updateConfig(cfg ConsumerConfig) error {
  8848  	o.mu.Lock()
  8849  	defer o.mu.Unlock()
  8850  	o.cfg = &FileConsumerInfo{ConsumerConfig: cfg}
  8851  	return o.writeConsumerMeta()
  8852  }
  8853  
// Write out the consumer meta data, i.e. config, plus its checksum file.
// If encryption is configured (prf set) and we have no asset key yet, the
// key is generated and persisted first.
// Lock should be held.
func (cfs *consumerFileStore) writeConsumerMeta() error {
	meta := filepath.Join(cfs.odir, JetStreamMetaFile)
	// Only bail on unexpected stat errors; a missing file is fine.
	if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) {
		return err
	}

	// Generate and persist our encryption key if needed.
	if cfs.prf != nil && cfs.aek == nil {
		fs := cfs.fs
		key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name + tsep + cfs.name)
		if err != nil {
			return err
		}
		cfs.aek = key
		keyFile := filepath.Join(cfs.odir, JetStreamMetaFileKey)
		if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
			return err
		}
		// Throttle blocking IO through the dios semaphore.
		<-dios
		err = os.WriteFile(keyFile, encrypted, defaultFilePerms)
		dios <- struct{}{}
		if err != nil {
			return err
		}
	}

	b, err := json.Marshal(cfs.cfg)
	if err != nil {
		return err
	}
	// Encrypt if needed.
	if cfs.aek != nil {
		// Nonce is prepended to the sealed payload.
		nonce := make([]byte, cfs.aek.NonceSize(), cfs.aek.NonceSize()+len(b)+cfs.aek.Overhead())
		if n, err := rand.Read(nonce); err != nil {
			return err
		} else if n != len(nonce) {
			return fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce))
		}
		b = cfs.aek.Seal(nonce, nonce, b, nil)
	}

	<-dios
	err = os.WriteFile(meta, b, defaultFilePerms)
	dios <- struct{}{}
	if err != nil {
		return err
	}
	// Write the checksum of the (possibly encrypted) meta bytes alongside.
	cfs.hh.Reset()
	cfs.hh.Write(b)
	checksum := hex.EncodeToString(cfs.hh.Sum(nil))
	sum := filepath.Join(cfs.odir, JetStreamMetaFileSum)

	<-dios
	err = os.WriteFile(sum, []byte(checksum), defaultFilePerms)
	dios <- struct{}{}
	if err != nil {
		return err
	}
	return nil
}
  8915  
  8916  // Consumer version.
  8917  func checkConsumerHeader(hdr []byte) (uint8, error) {
  8918  	if hdr == nil || len(hdr) < 2 || hdr[0] != magic {
  8919  		return 0, errCorruptState
  8920  	}
  8921  	version := hdr[1]
  8922  	switch version {
  8923  	case 1, 2:
  8924  		return version, nil
  8925  	}
  8926  	return 0, fmt.Errorf("unsupported version: %d", version)
  8927  }
  8928  
  8929  func (o *consumerFileStore) copyPending() map[uint64]*Pending {
  8930  	pending := make(map[uint64]*Pending, len(o.state.Pending))
  8931  	for seq, p := range o.state.Pending {
  8932  		pending[seq] = &Pending{p.Sequence, p.Timestamp}
  8933  	}
  8934  	return pending
  8935  }
  8936  
  8937  func (o *consumerFileStore) copyRedelivered() map[uint64]uint64 {
  8938  	redelivered := make(map[uint64]uint64, len(o.state.Redelivered))
  8939  	for seq, dc := range o.state.Redelivered {
  8940  		redelivered[seq] = dc
  8941  	}
  8942  	return redelivered
  8943  }
  8944  
// Type returns the type of the underlying store.
func (o *consumerFileStore) Type() StorageType { return FileStorage }

// State retrieves the state from the state file, deep-copying the pending
// and redelivered maps so the caller owns the result.
// This is not expected to be called in high performance code, only on startup.
func (o *consumerFileStore) State() (*ConsumerState, error) {
	return o.stateWithCopy(true)
}
  8953  
// BorrowState returns the state without copying pending or redelivered,
// so should only be done under the consumer owner's lock.
func (o *consumerFileStore) BorrowState() (*ConsumerState, error) {
	return o.stateWithCopy(false)
}

// stateWithCopy returns the state under the store lock; doCopy controls
// whether the pending/redelivered maps are deep-copied.
func (o *consumerFileStore) stateWithCopy(doCopy bool) (*ConsumerState, error) {
	o.mu.Lock()
	defer o.mu.Unlock()
	return o.stateWithCopyLocked(doCopy)
}
  8965  
// stateWithCopyLocked returns our consumer state, reading it in from disk
// if we have no running state yet. When doCopy is false the returned state
// shares the pending/redelivered maps with o.state.
// Lock should be held.
func (o *consumerFileStore) stateWithCopyLocked(doCopy bool) (*ConsumerState, error) {
	if o.closed {
		return nil, ErrStoreClosed
	}

	state := &ConsumerState{}

	// See if we have a running state or if we need to read in from disk.
	if o.state.Delivered.Consumer != 0 || o.state.Delivered.Stream != 0 {
		state.Delivered = o.state.Delivered
		state.AckFloor = o.state.AckFloor
		if len(o.state.Pending) > 0 {
			if doCopy {
				state.Pending = o.copyPending()
			} else {
				state.Pending = o.state.Pending
			}
		}
		if len(o.state.Redelivered) > 0 {
			if doCopy {
				state.Redelivered = o.copyRedelivered()
			} else {
				state.Redelivered = o.state.Redelivered
			}
		}
		return state, nil
	}

	// Read the state in here from disk..
	// Throttle blocking IO through the dios semaphore.
	<-dios
	buf, err := os.ReadFile(o.ifn)
	dios <- struct{}{}

	// A missing file is not an error; it simply means empty state.
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}

	if len(buf) == 0 {
		return state, nil
	}

	// Check on encryption.
	if o.aek != nil {
		// The nonce is prepended to the ciphertext on disk.
		ns := o.aek.NonceSize()
		buf, err = o.aek.Open(nil, buf[:ns], buf[ns:], nil)
		if err != nil {
			return nil, err
		}
	}

	state, err = decodeConsumerState(buf)
	if err != nil {
		return nil, err
	}

	// Copy this state into our own.
	o.state.Delivered = state.Delivered
	o.state.AckFloor = state.AckFloor
	if len(state.Pending) > 0 {
		if doCopy {
			o.state.Pending = make(map[uint64]*Pending, len(state.Pending))
			for seq, p := range state.Pending {
				o.state.Pending[seq] = &Pending{p.Sequence, p.Timestamp}
			}
		} else {
			o.state.Pending = state.Pending
		}
	}
	if len(state.Redelivered) > 0 {
		if doCopy {
			o.state.Redelivered = make(map[uint64]uint64, len(state.Redelivered))
			for seq, dc := range state.Redelivered {
				o.state.Redelivered[seq] = dc
			}
		} else {
			o.state.Redelivered = state.Redelivered
		}
	}

	return state, nil
}
  9048  
  9049  // Lock should be held. Called at startup.
  9050  func (o *consumerFileStore) loadState() {
  9051  	if _, err := os.Stat(o.ifn); err == nil {
  9052  		// This will load our state in from disk.
  9053  		o.stateWithCopyLocked(false)
  9054  	}
  9055  }
  9056  
// Decode consumer state from its varint-based on-disk encoding (version 1
// or 2). Sequences in the pending/redelivered sections are stored as deltas
// from the ack floor and are adjusted back to absolute values here.
func decodeConsumerState(buf []byte) (*ConsumerState, error) {
	version, err := checkConsumerHeader(buf)
	if err != nil {
		return nil, err
	}

	bi := hdrLen
	// Helpers, will set bi to -1 on error so corruption is detected after
	// a batch of reads rather than on each one.
	readSeq := func() uint64 {
		if bi < 0 {
			return 0
		}
		seq, n := binary.Uvarint(buf[bi:])
		if n <= 0 {
			bi = -1
			return 0
		}
		bi += n
		return seq
	}
	readTimeStamp := func() int64 {
		if bi < 0 {
			return 0
		}
		ts, n := binary.Varint(buf[bi:])
		if n <= 0 {
			bi = -1
			return -1
		}
		bi += n
		return ts
	}
	// Just for clarity below.
	readLen := readSeq
	readCount := readSeq

	state := &ConsumerState{}
	state.AckFloor.Consumer = readSeq()
	state.AckFloor.Stream = readSeq()
	state.Delivered.Consumer = readSeq()
	state.Delivered.Stream = readSeq()

	if bi == -1 {
		return nil, errCorruptState
	}
	if version == 1 {
		// Adjust back. Version 1 also stored delivered as next to be delivered,
		// so adjust that back down here.
		if state.AckFloor.Consumer > 1 {
			state.Delivered.Consumer += state.AckFloor.Consumer - 1
		}
		if state.AckFloor.Stream > 1 {
			state.Delivered.Stream += state.AckFloor.Stream - 1
		}
	}

	// Protect ourselves against rolling backwards.
	// A set high bit means the value is implausibly large, i.e. corrupt.
	const hbit = 1 << 63
	if state.AckFloor.Stream&hbit != 0 || state.Delivered.Stream&hbit != 0 {
		return nil, errCorruptState
	}

	// We have additional stuff.
	if numPending := readLen(); numPending > 0 {
		// Timestamps are stored relative to this base value.
		mints := readTimeStamp()
		state.Pending = make(map[uint64]*Pending, numPending)
		for i := 0; i < int(numPending); i++ {
			sseq := readSeq()
			var dseq uint64
			if version == 2 {
				dseq = readSeq()
			}
			ts := readTimeStamp()
			// Check the state machine for corruption, not the value which could be -1.
			if bi == -1 {
				return nil, errCorruptState
			}
			// Adjust seq back.
			sseq += state.AckFloor.Stream
			if sseq == 0 {
				return nil, errCorruptState
			}
			if version == 2 {
				dseq += state.AckFloor.Consumer
			}
			// Adjust the timestamp back.
			if version == 1 {
				ts = (ts + mints) * int64(time.Second)
			} else {
				ts = (mints - ts) * int64(time.Second)
			}
			// Store in pending.
			state.Pending[sseq] = &Pending{dseq, ts}
		}
	}

	// We have redelivered entries here.
	if numRedelivered := readLen(); numRedelivered > 0 {
		state.Redelivered = make(map[uint64]uint64, numRedelivered)
		for i := 0; i < int(numRedelivered); i++ {
			if seq, n := readSeq(), readCount(); seq > 0 && n > 0 {
				// Adjust seq back.
				seq += state.AckFloor.Stream
				state.Redelivered[seq] = n
			}
		}
	}

	return state, nil
}
  9168  
  9169  // Stop the processing of the consumers's state.
  9170  func (o *consumerFileStore) Stop() error {
  9171  	o.mu.Lock()
  9172  	if o.closed {
  9173  		o.mu.Unlock()
  9174  		return nil
  9175  	}
  9176  	if o.qch != nil {
  9177  		close(o.qch)
  9178  		o.qch = nil
  9179  	}
  9180  
  9181  	var err error
  9182  	var buf []byte
  9183  
  9184  	if o.dirty {
  9185  		// Make sure to write this out..
  9186  		if buf, err = o.encodeState(); err == nil && len(buf) > 0 {
  9187  			if o.aek != nil {
  9188  				if buf, err = o.encryptState(buf); err != nil {
  9189  					return err
  9190  				}
  9191  			}
  9192  		}
  9193  	}
  9194  
  9195  	o.odir = _EMPTY_
  9196  	o.closed = true
  9197  	ifn, fs := o.ifn, o.fs
  9198  	o.mu.Unlock()
  9199  
  9200  	fs.RemoveConsumer(o)
  9201  
  9202  	if len(buf) > 0 {
  9203  		o.waitOnFlusher()
  9204  		<-dios
  9205  		err = os.WriteFile(ifn, buf, defaultFilePerms)
  9206  		dios <- struct{}{}
  9207  	}
  9208  	return err
  9209  }
  9210  
  9211  func (o *consumerFileStore) waitOnFlusher() {
  9212  	if !o.inFlusher() {
  9213  		return
  9214  	}
  9215  
  9216  	timeout := time.Now().Add(100 * time.Millisecond)
  9217  	for time.Now().Before(timeout) {
  9218  		if !o.inFlusher() {
  9219  			return
  9220  		}
  9221  		time.Sleep(10 * time.Millisecond)
  9222  	}
  9223  }
  9224  
// Delete deletes the consumer, removing its directory on disk.
func (o *consumerFileStore) Delete() error {
	return o.delete(false)
}

// StreamDelete is used when the parent stream is being deleted; it closes
// the store but skips removing directories and deregistering.
func (o *consumerFileStore) StreamDelete() error {
	return o.delete(true)
}
  9233  
  9234  func (o *consumerFileStore) delete(streamDeleted bool) error {
  9235  	o.mu.Lock()
  9236  	if o.closed {
  9237  		o.mu.Unlock()
  9238  		return nil
  9239  	}
  9240  	if o.qch != nil {
  9241  		close(o.qch)
  9242  		o.qch = nil
  9243  	}
  9244  
  9245  	var err error
  9246  	odir := o.odir
  9247  	o.odir = _EMPTY_
  9248  	o.closed = true
  9249  	fs := o.fs
  9250  	o.mu.Unlock()
  9251  
  9252  	// If our stream was not deleted this will remove the directories.
  9253  	if odir != _EMPTY_ && !streamDeleted {
  9254  		<-dios
  9255  		err = os.RemoveAll(odir)
  9256  		dios <- struct{}{}
  9257  	}
  9258  
  9259  	if !streamDeleted {
  9260  		fs.RemoveConsumer(o)
  9261  	}
  9262  
  9263  	return err
  9264  }
  9265  
  9266  func (fs *fileStore) AddConsumer(o ConsumerStore) error {
  9267  	fs.cmu.Lock()
  9268  	defer fs.cmu.Unlock()
  9269  	fs.cfs = append(fs.cfs, o)
  9270  	return nil
  9271  }
  9272  
  9273  func (fs *fileStore) RemoveConsumer(o ConsumerStore) error {
  9274  	fs.cmu.Lock()
  9275  	defer fs.cmu.Unlock()
  9276  	for i, cfs := range fs.cfs {
  9277  		if o == cfs {
  9278  			fs.cfs = append(fs.cfs[:i], fs.cfs[i+1:]...)
  9279  			break
  9280  		}
  9281  	}
  9282  	return nil
  9283  }
  9284  
  9285  ////////////////////////////////////////////////////////////////////////////////
  9286  // Templates
  9287  ////////////////////////////////////////////////////////////////////////////////
  9288  
// templateFileStore handles on-disk persistence of stream templates.
type templateFileStore struct {
	// dir is the templates storage directory.
	dir string
	// hh is used to checksum serialized template metadata.
	hh  hash.Hash64
}
  9293  
  9294  func newTemplateFileStore(storeDir string) *templateFileStore {
  9295  	tdir := filepath.Join(storeDir, tmplsDir)
  9296  	key := sha256.Sum256([]byte("templates"))
  9297  	hh, err := highwayhash.New64(key[:])
  9298  	if err != nil {
  9299  		return nil
  9300  	}
  9301  	return &templateFileStore{dir: tdir, hh: hh}
  9302  }
  9303  
  9304  func (ts *templateFileStore) Store(t *streamTemplate) error {
  9305  	dir := filepath.Join(ts.dir, t.Name)
  9306  	if err := os.MkdirAll(dir, defaultDirPerms); err != nil {
  9307  		return fmt.Errorf("could not create templates storage directory for %q- %v", t.Name, err)
  9308  	}
  9309  	meta := filepath.Join(dir, JetStreamMetaFile)
  9310  	if _, err := os.Stat(meta); (err != nil && !os.IsNotExist(err)) || err == nil {
  9311  		return err
  9312  	}
  9313  	t.mu.Lock()
  9314  	b, err := json.Marshal(t)
  9315  	t.mu.Unlock()
  9316  	if err != nil {
  9317  		return err
  9318  	}
  9319  	if err := os.WriteFile(meta, b, defaultFilePerms); err != nil {
  9320  		return err
  9321  	}
  9322  	// FIXME(dlc) - Do checksum
  9323  	ts.hh.Reset()
  9324  	ts.hh.Write(b)
  9325  	checksum := hex.EncodeToString(ts.hh.Sum(nil))
  9326  	sum := filepath.Join(dir, JetStreamMetaFileSum)
  9327  	if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil {
  9328  		return err
  9329  	}
  9330  	return nil
  9331  }
  9332  
  9333  func (ts *templateFileStore) Delete(t *streamTemplate) error {
  9334  	return os.RemoveAll(filepath.Join(ts.dir, t.Name))
  9335  }
  9336  
  9337  ////////////////////////////////////////////////////////////////////////////////
  9338  // Compression
  9339  ////////////////////////////////////////////////////////////////////////////////
  9340  
// CompressionInfo describes how a block was compressed so it can be
// correctly decompressed on read.
type CompressionInfo struct {
	// Algorithm is the compression algorithm used (e.g. NoCompression or S2Compression).
	Algorithm    StoreCompression
	// OriginalSize is the uncompressed size in bytes.
	OriginalSize uint64
}
  9345  
  9346  func (c *CompressionInfo) MarshalMetadata() []byte {
  9347  	b := make([]byte, 14) // 4 + potentially up to 10 for uint64
  9348  	b[0], b[1], b[2] = 'c', 'm', 'p'
  9349  	b[3] = byte(c.Algorithm)
  9350  	n := binary.PutUvarint(b[4:], c.OriginalSize)
  9351  	return b[:4+n]
  9352  }
  9353  
  9354  func (c *CompressionInfo) UnmarshalMetadata(b []byte) (int, error) {
  9355  	c.Algorithm = NoCompression
  9356  	c.OriginalSize = 0
  9357  	if len(b) < 5 { // 4 + min 1 for uvarint uint64
  9358  		return 0, nil
  9359  	}
  9360  	if b[0] != 'c' || b[1] != 'm' || b[2] != 'p' {
  9361  		return 0, nil
  9362  	}
  9363  	var n int
  9364  	c.Algorithm = StoreCompression(b[3])
  9365  	c.OriginalSize, n = binary.Uvarint(b[4:])
  9366  	if n <= 0 {
  9367  		return 0, fmt.Errorf("metadata incomplete")
  9368  	}
  9369  	return 4 + n, nil
  9370  }
  9371  
// Compress compresses the block body in buf with the given algorithm,
// preserving the trailing checksum uncompressed at the end of the output.
// NoCompression returns buf unchanged.
func (alg StoreCompression) Compress(buf []byte) ([]byte, error) {
	if len(buf) < checksumSize {
		return nil, fmt.Errorf("uncompressed buffer is too short")
	}
	bodyLen := int64(len(buf) - checksumSize)
	var output bytes.Buffer
	var writer io.WriteCloser
	switch alg {
	case NoCompression:
		return buf, nil
	case S2Compression:
		writer = s2.NewWriter(&output)
	default:
		return nil, fmt.Errorf("compression algorithm not known")
	}

	input := bytes.NewReader(buf[:bodyLen])
	checksum := buf[bodyLen:]

	// Compress the block content, but don't compress the checksum.
	// We will preserve it at the end of the block as-is.
	if n, err := io.CopyN(writer, input, bodyLen); err != nil {
		return nil, fmt.Errorf("error writing to compression writer: %w", err)
	} else if n != bodyLen {
		return nil, fmt.Errorf("short write on body (%d != %d)", n, bodyLen)
	}
	// Close flushes any buffered compressed data to output.
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("error closing compression writer: %w", err)
	}

	// Now add the checksum back onto the end of the block.
	if n, err := output.Write(checksum); err != nil {
		return nil, fmt.Errorf("error writing checksum: %w", err)
	} else if n != checksumSize {
		return nil, fmt.Errorf("short write on checksum (%d != %d)", n, checksumSize)
	}

	return output.Bytes(), nil
}
  9411  
// Decompress decompresses a block produced by Compress, reattaching the
// trailing checksum (which is stored uncompressed) to the result.
// NoCompression returns buf unchanged.
func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) {
	if len(buf) < checksumSize {
		return nil, fmt.Errorf("compressed buffer is too short")
	}
	bodyLen := int64(len(buf) - checksumSize)
	input := bytes.NewReader(buf[:bodyLen])

	var reader io.ReadCloser
	switch alg {
	case NoCompression:
		return buf, nil
	case S2Compression:
		reader = io.NopCloser(s2.NewReader(input))
	default:
		return nil, fmt.Errorf("compression algorithm not known")
	}

	// Decompress the block content. The checksum isn't compressed so
	// we can preserve it from the end of the block as-is.
	checksum := buf[bodyLen:]
	output, err := io.ReadAll(reader)
	if err != nil {
		return nil, fmt.Errorf("error reading compression reader: %w", err)
	}
	output = append(output, checksum...)

	return output, reader.Close()
}