get.pme.sh/pnats@v0.0.0-20240304004023-26bb5a137ed0/server/filestore.go

     1  // Copyright 2019-2024 The NATS Authors
     2  // Licensed under the Apache License, Version 2.0 (the "License");
     3  // you may not use this file except in compliance with the License.
     4  // You may obtain a copy of the License at
     5  //
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package server
    15  
    16  import (
    17  	"archive/tar"
    18  	"bytes"
    19  	"crypto/aes"
    20  	"crypto/cipher"
    21  	"crypto/rand"
    22  	"crypto/sha256"
    23  	"encoding/binary"
    24  	"encoding/hex"
    25  	"encoding/json"
    26  	"errors"
    27  	"fmt"
    28  	"hash"
    29  	"io"
    30  	"math"
    31  	"net"
    32  	"os"
    33  	"path/filepath"
    34  	"sort"
    35  	"strings"
    36  	"sync"
    37  	"sync/atomic"
    38  	"time"
    39  
    40  	"get.pme.sh/pnats/server/avl"
    41  	"get.pme.sh/pnats/server/stree"
    42  	"github.com/klauspost/compress/s2"
    43  	"github.com/minio/highwayhash"
    44  	"golang.org/x/crypto/chacha20"
    45  	"golang.org/x/crypto/chacha20poly1305"
    46  )
    47  
    48  type FileStoreConfig struct {
    49  	// Where the parent directory for all storage will be located.
    50  	StoreDir string
    51  	// BlockSize is the file block size. This also represents the maximum overhead size.
    52  	BlockSize uint64
    53  	// CacheExpire is how long with no activity until we expire the cache.
    54  	CacheExpire time.Duration
    55  	// SyncInterval is how often we sync to disk in the background.
    56  	SyncInterval time.Duration
    57  	// SyncAlways is when the stream should sync all data writes.
    58  	SyncAlways bool
    59  	// AsyncFlush allows async flush to batch write operations.
    60  	AsyncFlush bool
    61  	// Cipher is the cipher to use when encrypting.
    62  	Cipher StoreCipher
    63  	// Compression is the algorithm to use when compressing.
    64  	Compression StoreCompression
    65  
    66  	// Internal reference to our server.
    67  	srv *Server
    68  }
    69  
    70  // FileStreamInfo allows us to remember created time.
    71  type FileStreamInfo struct {
    72  	Created time.Time
    73  	StreamConfig
    74  }
    75  
    76  type StoreCipher int
    77  
    78  const (
    79  	ChaCha StoreCipher = iota
    80  	AES
    81  	NoCipher
    82  )
    83  
    84  func (cipher StoreCipher) String() string {
    85  	switch cipher {
    86  	case ChaCha:
    87  		return "ChaCha20-Poly1305"
    88  	case AES:
    89  		return "AES-GCM"
    90  	case NoCipher:
    91  		return "None"
    92  	default:
    93  		return "Unknown StoreCipher"
    94  	}
    95  }
    96  
    97  type StoreCompression uint8
    98  
    99  const (
   100  	NoCompression StoreCompression = iota
   101  	S2Compression
   102  )
   103  
   104  func (alg StoreCompression) String() string {
   105  	switch alg {
   106  	case NoCompression:
   107  		return "None"
   108  	case S2Compression:
   109  		return "S2"
   110  	default:
   111  		return "Unknown StoreCompression"
   112  	}
   113  }
   114  
   115  func (alg StoreCompression) MarshalJSON() ([]byte, error) {
   116  	var str string
   117  	switch alg {
   118  	case S2Compression:
   119  		str = "s2"
   120  	case NoCompression:
   121  		str = "none"
   122  	default:
   123  		return nil, fmt.Errorf("unknown compression algorithm")
   124  	}
   125  	return json.Marshal(str)
   126  }
   127  
   128  func (alg *StoreCompression) UnmarshalJSON(b []byte) error {
   129  	var str string
   130  	if err := json.Unmarshal(b, &str); err != nil {
   131  		return err
   132  	}
   133  	switch str {
   134  	case "s2":
   135  		*alg = S2Compression
   136  	case "none":
   137  		*alg = NoCompression
   138  	default:
   139  		return fmt.Errorf("unknown compression algorithm")
   140  	}
   141  	return nil
   142  }
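
// exampleStoreCompressionJSON is an illustrative sketch (not part of the
// original file) showing how the marshalers above round-trip a
// StoreCompression value: S2Compression encodes as the JSON string "s2" and
// decodes back to the same constant. The function name is hypothetical.
func exampleStoreCompressionJSON() error {
	in := S2Compression
	b, err := json.Marshal(in)
	if err != nil {
		return err
	}
	// b now holds `"s2"`.
	var out StoreCompression
	if err := json.Unmarshal(b, &out); err != nil {
		return err
	}
	if out != in {
		return fmt.Errorf("compression round trip mismatch: %v != %v", out, in)
	}
	return nil
}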
   143  
    144  // FileConsumerInfo is used for creating consumer stores.
   145  type FileConsumerInfo struct {
   146  	Created time.Time
   147  	Name    string
   148  	ConsumerConfig
   149  }
   150  
   151  // Default file and directory permissions.
   152  const (
   153  	defaultDirPerms  = os.FileMode(0750)
   154  	defaultFilePerms = os.FileMode(0640)
   155  )
   156  
   157  type psi struct {
   158  	total uint64
   159  	fblk  uint32
   160  	lblk  uint32
   161  }
   162  
   163  type fileStore struct {
   164  	srv         *Server
   165  	mu          sync.RWMutex
   166  	state       StreamState
   167  	tombs       []uint64
   168  	ld          *LostStreamData
   169  	scb         StorageUpdateHandler
   170  	ageChk      *time.Timer
   171  	syncTmr     *time.Timer
   172  	cfg         FileStreamInfo
   173  	fcfg        FileStoreConfig
   174  	prf         keyGen
   175  	oldprf      keyGen
   176  	aek         cipher.AEAD
   177  	lmb         *msgBlock
   178  	blks        []*msgBlock
   179  	bim         map[uint32]*msgBlock
   180  	psim        *stree.SubjectTree[psi]
   181  	tsl         int
   182  	adml        int
   183  	hh          hash.Hash64
   184  	qch         chan struct{}
   185  	fch         chan struct{}
   186  	fsld        chan struct{}
   187  	cmu         sync.RWMutex
   188  	cfs         []ConsumerStore
   189  	sips        int
   190  	dirty       int
   191  	closing     bool
   192  	closed      bool
   193  	fip         bool
   194  	receivedAny bool
   195  }
   196  
   197  // Represents a message store block and its data.
   198  type msgBlock struct {
   199  	// Here for 32bit systems and atomic.
   200  	first      msgId
   201  	last       msgId
   202  	mu         sync.RWMutex
   203  	fs         *fileStore
   204  	aek        cipher.AEAD
   205  	bek        cipher.Stream
   206  	seed       []byte
   207  	nonce      []byte
   208  	mfn        string
   209  	mfd        *os.File
   210  	cmp        StoreCompression // Effective compression at the time of loading the block
   211  	liwsz      int64
   212  	index      uint32
   213  	bytes      uint64 // User visible bytes count.
   214  	rbytes     uint64 // Total bytes (raw) including deleted. Used for rolling to new blk.
   215  	msgs       uint64 // User visible message count.
   216  	fss        map[string]*SimpleState
   217  	kfn        string
   218  	lwts       int64
   219  	llts       int64
   220  	lrts       int64
   221  	llseq      uint64
   222  	hh         hash.Hash64
   223  	cache      *cache
   224  	cloads     uint64
   225  	cexp       time.Duration
   226  	ctmr       *time.Timer
   227  	werr       error
   228  	dmap       avl.SequenceSet
   229  	fch        chan struct{}
   230  	qch        chan struct{}
   231  	lchk       [8]byte
   232  	loading    bool
   233  	flusher    bool
   234  	noTrack    bool
   235  	needSync   bool
   236  	syncAlways bool
   237  	closed     bool
   238  
   239  	// Used to mock write failures.
   240  	mockWriteErr bool
   241  }
   242  
   243  // Write through caching layer that is also used on loading messages.
   244  type cache struct {
   245  	buf  []byte
   246  	off  int
   247  	wp   int
   248  	idx  []uint32
   249  	lrl  uint32
   250  	fseq uint64
   251  	nra  bool
   252  }
   253  
   254  type msgId struct {
   255  	seq uint64
   256  	ts  int64
   257  }
   258  
   259  const (
   260  	// Magic is used to identify the file store files.
   261  	magic = uint8(22)
   262  	// Version
   263  	version = uint8(1)
   264  	// New IndexInfo Version
   265  	newVersion = uint8(2)
   266  	// hdrLen
   267  	hdrLen = 2
   268  	// This is where we keep the streams.
   269  	streamsDir = "streams"
   270  	// This is where we keep the message store blocks.
   271  	msgDir = "msgs"
   272  	// This is where we temporarily move the messages dir.
   273  	purgeDir = "__msgs__"
   274  	// used to scan blk file names.
   275  	blkScan = "%d.blk"
   276  	// used for compacted blocks that are staged.
   277  	newScan = "%d.new"
   278  	// used to scan index file names.
   279  	indexScan = "%d.idx"
   280  	// used to store our block encryption key.
   281  	keyScan = "%d.key"
   282  	// to look for orphans
   283  	keyScanAll = "*.key"
   284  	// This is where we keep state on consumers.
   285  	consumerDir = "obs"
   286  	// Index file for a consumer.
   287  	consumerState = "o.dat"
   288  	// The suffix that will be given to a new temporary block during compression.
   289  	compressTmpSuffix = ".tmp"
   290  	// This is where we keep state on templates.
   291  	tmplsDir = "templates"
   292  	// Maximum size of a write buffer we may consider for re-use.
   293  	maxBufReuse = 2 * 1024 * 1024
   294  	// default cache buffer expiration
   295  	defaultCacheBufferExpiration = 2 * time.Second
   296  	// default sync interval
   297  	defaultSyncInterval = 2 * time.Minute
   298  	// default idle timeout to close FDs.
   299  	closeFDsIdle = 30 * time.Second
   300  	// coalesceMinimum
   301  	coalesceMinimum = 16 * 1024
   302  	// maxFlushWait is maximum we will wait to gather messages to flush.
   303  	maxFlushWait = 8 * time.Millisecond
   304  
   305  	// Metafiles for streams and consumers.
   306  	JetStreamMetaFile    = "meta.inf"
   307  	JetStreamMetaFileSum = "meta.sum"
   308  	JetStreamMetaFileKey = "meta.key"
   309  
   310  	// This is the full snapshotted state for the stream.
   311  	streamStreamStateFile = "index.db"
   312  
   313  	// AEK key sizes
   314  	minMetaKeySize = 64
   315  	minBlkKeySize  = 64
   316  
   317  	// Default stream block size.
   318  	defaultLargeBlockSize = 8 * 1024 * 1024 // 8MB
   319  	// Default for workqueue or interest based.
   320  	defaultMediumBlockSize = 4 * 1024 * 1024 // 4MB
   321  	// For smaller reuse buffers. Usually being generated during contention on the lead write buffer.
   322  	// E.g. mirrors/sources etc.
   323  	defaultSmallBlockSize = 1 * 1024 * 1024 // 1MB
   324  	// Maximum size for the encrypted head block.
   325  	maximumEncryptedBlockSize = 2 * 1024 * 1024 // 2MB
   326  	// Default for KV based
   327  	defaultKVBlockSize = defaultMediumBlockSize
   328  	// max block size for now.
   329  	maxBlockSize = defaultLargeBlockSize
   330  	// Compact minimum threshold.
   331  	compactMinimum = 2 * 1024 * 1024 // 2MB
   332  	// FileStoreMinBlkSize is minimum size we will do for a blk size.
    333  	FileStoreMinBlkSize = 32 * 1000 // 32KB
   334  	// FileStoreMaxBlkSize is maximum size we will do for a blk size.
   335  	FileStoreMaxBlkSize = maxBlockSize
   336  	// Check for bad record length value due to corrupt data.
   337  	rlBadThresh = 32 * 1024 * 1024
   338  	// Checksum size for hash for msg records.
   339  	recordHashSize = 8
   340  )
   341  
   342  func newFileStore(fcfg FileStoreConfig, cfg StreamConfig) (*fileStore, error) {
   343  	return newFileStoreWithCreated(fcfg, cfg, time.Now().UTC(), nil, nil)
   344  }
   345  
   346  func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created time.Time, prf, oldprf keyGen) (*fileStore, error) {
   347  	if cfg.Name == _EMPTY_ {
   348  		return nil, fmt.Errorf("name required")
   349  	}
   350  	if cfg.Storage != FileStorage {
   351  		return nil, fmt.Errorf("fileStore requires file storage type in config")
   352  	}
   353  	// Default values.
   354  	if fcfg.BlockSize == 0 {
   355  		fcfg.BlockSize = dynBlkSize(cfg.Retention, cfg.MaxBytes, prf != nil)
   356  	}
   357  	if fcfg.BlockSize > maxBlockSize {
   358  		return nil, fmt.Errorf("filestore max block size is %s", friendlyBytes(maxBlockSize))
   359  	}
   360  	if fcfg.CacheExpire == 0 {
   361  		fcfg.CacheExpire = defaultCacheBufferExpiration
   362  	}
   363  	if fcfg.SyncInterval == 0 {
   364  		fcfg.SyncInterval = defaultSyncInterval
   365  	}
   366  
   367  	// Check the directory
   368  	if stat, err := os.Stat(fcfg.StoreDir); os.IsNotExist(err) {
   369  		if err := os.MkdirAll(fcfg.StoreDir, defaultDirPerms); err != nil {
   370  			return nil, fmt.Errorf("could not create storage directory - %v", err)
   371  		}
   372  	} else if stat == nil || !stat.IsDir() {
   373  		return nil, fmt.Errorf("storage directory is not a directory")
   374  	}
   375  	tmpfile, err := os.CreateTemp(fcfg.StoreDir, "_test_")
   376  	if err != nil {
   377  		return nil, fmt.Errorf("storage directory is not writable")
   378  	}
   379  
   380  	tmpfile.Close()
   381  	<-dios
   382  	os.Remove(tmpfile.Name())
   383  	dios <- struct{}{}
   384  
   385  	fs := &fileStore{
   386  		fcfg:   fcfg,
   387  		psim:   stree.NewSubjectTree[psi](),
   388  		bim:    make(map[uint32]*msgBlock),
   389  		cfg:    FileStreamInfo{Created: created, StreamConfig: cfg},
   390  		prf:    prf,
   391  		oldprf: oldprf,
   392  		qch:    make(chan struct{}),
   393  		fch:    make(chan struct{}, 1),
   394  		fsld:   make(chan struct{}),
   395  		srv:    fcfg.srv,
   396  	}
   397  
    398  	// Set flush in place (fip) to the inverse of AsyncFlush, which by default is false.
   399  	fs.fip = !fcfg.AsyncFlush
   400  
   401  	// Check if this is a new setup.
   402  	mdir := filepath.Join(fcfg.StoreDir, msgDir)
   403  	odir := filepath.Join(fcfg.StoreDir, consumerDir)
   404  	if err := os.MkdirAll(mdir, defaultDirPerms); err != nil {
   405  		return nil, fmt.Errorf("could not create message storage directory - %v", err)
   406  	}
   407  	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
   408  		return nil, fmt.Errorf("could not create consumer storage directory - %v", err)
   409  	}
   410  
   411  	// Create highway hash for message blocks. Use sha256 of directory as key.
   412  	key := sha256.Sum256([]byte(cfg.Name))
   413  	fs.hh, err = highwayhash.New64(key[:])
   414  	if err != nil {
   415  		return nil, fmt.Errorf("could not create hash: %v", err)
   416  	}
   417  
   418  	keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
    419  	// Make sure we do not have an encrypted store underneath us but no main key.
   420  	if fs.prf == nil {
   421  		if _, err := os.Stat(keyFile); err == nil {
   422  			return nil, errNoMainKey
   423  		}
   424  	}
   425  
   426  	// Attempt to recover our state.
   427  	err = fs.recoverFullState()
   428  	if err != nil {
   429  		// Hold onto state
   430  		prior := fs.state
   431  		// Reset anything that could have been set from above.
   432  		fs.state = StreamState{}
   433  		fs.psim, fs.tsl = fs.psim.Empty(), 0
   434  		fs.bim = make(map[uint32]*msgBlock)
   435  		fs.blks = nil
   436  		fs.tombs = nil
   437  
   438  		// Recover our message state the old way
   439  		if err := fs.recoverMsgs(); err != nil {
   440  			return nil, err
   441  		}
   442  
   443  		// Check if our prior state remembers a last sequence past where we can see.
   444  		if fs.ld != nil && prior.LastSeq > fs.state.LastSeq {
   445  			fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime
   446  			if lmb, err := fs.newMsgBlockForWrite(); err == nil {
   447  				lmb.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano())
   448  			} else {
   449  				return nil, err
   450  			}
   451  		}
   452  		// Since we recovered here, make sure to kick ourselves to write out our stream state.
   453  		fs.dirty++
   454  		defer fs.kickFlushStateLoop()
   455  	}
   456  
   457  	// Also make sure we get rid of old idx and fss files on return.
    458  	// Do this in a separate goroutine rather than inline, at the end of processing.
   459  	defer func() {
   460  		go fs.cleanupOldMeta()
   461  	}()
   462  
    463  	// Lock while we do enforcements and removals.
   464  	fs.mu.Lock()
   465  
   466  	// Check if we have any left over tombstones to process.
   467  	if len(fs.tombs) > 0 {
   468  		for _, seq := range fs.tombs {
   469  			fs.removeMsg(seq, false, true, false)
   470  			fs.removeFromLostData(seq)
   471  		}
   472  		// Not needed after this phase.
   473  		fs.tombs = nil
   474  	}
   475  
   476  	// Limits checks and enforcement.
   477  	fs.enforceMsgLimit()
   478  	fs.enforceBytesLimit()
   479  
   480  	// Do age checks too, make sure to call in place.
   481  	if fs.cfg.MaxAge != 0 {
   482  		fs.expireMsgsOnRecover()
   483  		fs.startAgeChk()
   484  	}
   485  
    486  	// If we have max msgs per subject make sure that is also enforced.
   487  	if fs.cfg.MaxMsgsPer > 0 {
   488  		fs.enforceMsgPerSubjectLimit(false)
   489  	}
   490  
   491  	// Grab first sequence for check below while we have lock.
   492  	firstSeq := fs.state.FirstSeq
   493  	fs.mu.Unlock()
   494  
   495  	// If the stream has an initial sequence number then make sure we
   496  	// have purged up until that point. We will do this only if the
   497  	// recovered first sequence number is before our configured first
   498  	// sequence. Need to do this locked as by now the age check timer
   499  	// has started.
   500  	if cfg.FirstSeq > 0 && firstSeq <= cfg.FirstSeq {
   501  		if _, err := fs.purge(cfg.FirstSeq); err != nil {
   502  			return nil, err
   503  		}
   504  	}
   505  
   506  	// Write our meta data if it does not exist or is zero'd out.
   507  	meta := filepath.Join(fcfg.StoreDir, JetStreamMetaFile)
   508  	fi, err := os.Stat(meta)
   509  	if err != nil && os.IsNotExist(err) || fi != nil && fi.Size() == 0 {
   510  		if err := fs.writeStreamMeta(); err != nil {
   511  			return nil, err
   512  		}
   513  	}
   514  
   515  	// If we expect to be encrypted check that what we are restoring is not plaintext.
   516  	// This can happen on snapshot restores or conversions.
   517  	if fs.prf != nil {
   518  		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
   519  			if err := fs.writeStreamMeta(); err != nil {
   520  				return nil, err
   521  			}
   522  		}
   523  	}
   524  
   525  	// Setup our sync timer.
   526  	fs.setSyncTimer()
   527  
    528  	// Spin up the goroutine that will write out our full state stream index.
   529  	go fs.flushStreamStateLoop(fs.fch, fs.qch, fs.fsld)
   530  
   531  	return fs, nil
   532  }
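
// exampleNewFileStore is an illustrative sketch (not part of the original
// file) of constructing a file store via newFileStore above. The directory,
// stream name and subject are made up; only fields already used in this file
// (StoreDir, BlockSize, Name, Subjects, Storage) are set, so created time
// defaults to time.Now().UTC() and no encryption PRF is supplied.
func exampleNewFileStore() (*fileStore, error) {
	dir, err := os.MkdirTemp(_EMPTY_, "fs_example")
	if err != nil {
		return nil, err
	}
	fcfg := FileStoreConfig{
		StoreDir:  dir,
		BlockSize: defaultMediumBlockSize,
	}
	cfg := StreamConfig{
		Name:     "EXAMPLE",
		Subjects: []string{"example.>"},
		Storage:  FileStorage,
	}
	return newFileStore(fcfg, cfg)
}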
   533  
   534  // Lock all existing message blocks.
   535  // Lock held on entry.
   536  func (fs *fileStore) lockAllMsgBlocks() {
   537  	for _, mb := range fs.blks {
   538  		mb.mu.Lock()
   539  	}
   540  }
   541  
   542  // Unlock all existing message blocks.
   543  // Lock held on entry.
   544  func (fs *fileStore) unlockAllMsgBlocks() {
   545  	for _, mb := range fs.blks {
   546  		mb.mu.Unlock()
   547  	}
   548  }
   549  
   550  func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error {
   551  	if fs.isClosed() {
   552  		return ErrStoreClosed
   553  	}
   554  	if cfg.Name == _EMPTY_ {
   555  		return fmt.Errorf("name required")
   556  	}
   557  	if cfg.Storage != FileStorage {
   558  		return fmt.Errorf("fileStore requires file storage type in config")
   559  	}
   560  
   561  	fs.mu.Lock()
   562  	new_cfg := FileStreamInfo{Created: fs.cfg.Created, StreamConfig: *cfg}
   563  	old_cfg := fs.cfg
   564  	// The reference story has changed here, so this full msg block lock
   565  	// may not be needed.
   566  	fs.lockAllMsgBlocks()
   567  	fs.cfg = new_cfg
   568  	fs.unlockAllMsgBlocks()
   569  	if err := fs.writeStreamMeta(); err != nil {
   570  		fs.lockAllMsgBlocks()
   571  		fs.cfg = old_cfg
   572  		fs.unlockAllMsgBlocks()
   573  		fs.mu.Unlock()
   574  		return err
   575  	}
   576  
   577  	// Limits checks and enforcement.
   578  	fs.enforceMsgLimit()
   579  	fs.enforceBytesLimit()
   580  
   581  	// Do age timers.
   582  	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
   583  		fs.startAgeChk()
   584  	}
   585  	if fs.ageChk != nil && fs.cfg.MaxAge == 0 {
   586  		fs.ageChk.Stop()
   587  		fs.ageChk = nil
   588  	}
   589  
   590  	if fs.cfg.MaxMsgsPer > 0 && fs.cfg.MaxMsgsPer < old_cfg.MaxMsgsPer {
   591  		fs.enforceMsgPerSubjectLimit(true)
   592  	}
   593  	fs.mu.Unlock()
   594  
   595  	if cfg.MaxAge != 0 {
   596  		fs.expireMsgs()
   597  	}
   598  	return nil
   599  }
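
// exampleUpdateMaxAge is an illustrative sketch (not part of the original
// file) of driving UpdateConfig above: copy the current stream config,
// change a limit such as MaxAge, and let the store re-run limit enforcement
// and its age timer. The one hour value is arbitrary; UpdateConfig takes the
// fs lock itself.
func exampleUpdateMaxAge(fs *fileStore) error {
	// Copy the current config (a real caller would coordinate with other writers).
	cfg := fs.cfg.StreamConfig
	cfg.MaxAge = time.Hour
	return fs.UpdateConfig(&cfg)
}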
   600  
   601  func dynBlkSize(retention RetentionPolicy, maxBytes int64, encrypted bool) uint64 {
   602  	if maxBytes > 0 {
   603  		blkSize := (maxBytes / 4) + 1 // (25% overhead)
   604  		// Round up to nearest 100
   605  		if m := blkSize % 100; m != 0 {
   606  			blkSize += 100 - m
   607  		}
   608  		if blkSize <= FileStoreMinBlkSize {
   609  			blkSize = FileStoreMinBlkSize
   610  		} else if blkSize >= FileStoreMaxBlkSize {
   611  			blkSize = FileStoreMaxBlkSize
   612  		} else {
   613  			blkSize = defaultMediumBlockSize
   614  		}
   615  		if encrypted && blkSize > maximumEncryptedBlockSize {
   616  			// Notes on this below.
   617  			blkSize = maximumEncryptedBlockSize
   618  		}
   619  		return uint64(blkSize)
   620  	}
   621  
   622  	switch {
   623  	case encrypted:
   624  		// In the case of encrypted stores, large blocks can result in worsened perf
   625  		// since many writes on disk involve re-encrypting the entire block. For now,
   626  		// we will enforce a cap on the block size when encryption is enabled to avoid
   627  		// this.
   628  		return maximumEncryptedBlockSize
   629  	case retention == LimitsPolicy:
   630  		// TODO(dlc) - Make the blocksize relative to this if set.
   631  		return defaultLargeBlockSize
   632  	default:
   633  		// TODO(dlc) - Make the blocksize relative to this if set.
   634  		return defaultMediumBlockSize
   635  	}
   636  }
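
// exampleDynBlkSizes is an illustrative sketch (not part of the original
// file) of the block sizes dynBlkSize selects per the logic above: limits
// retention with no MaxBytes gets the large 8MB block, encrypted stores are
// capped at 2MB, other retention policies default to 4MB, and a small
// MaxBytes is rounded up but floored at FileStoreMinBlkSize.
func exampleDynBlkSizes() {
	fmt.Println(dynBlkSize(LimitsPolicy, 0, false))       // defaultLargeBlockSize (8MB)
	fmt.Println(dynBlkSize(LimitsPolicy, 0, true))        // maximumEncryptedBlockSize (2MB)
	fmt.Println(dynBlkSize(WorkQueuePolicy, 0, false))    // defaultMediumBlockSize (4MB)
	fmt.Println(dynBlkSize(LimitsPolicy, 64*1000, false)) // FileStoreMinBlkSize (32KB)
}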
   637  
   638  func genEncryptionKey(sc StoreCipher, seed []byte) (ek cipher.AEAD, err error) {
   639  	if sc == ChaCha {
   640  		ek, err = chacha20poly1305.NewX(seed)
   641  	} else if sc == AES {
   642  		block, e := aes.NewCipher(seed)
   643  		if e != nil {
    644  			return nil, e
   645  		}
   646  		ek, err = cipher.NewGCMWithNonceSize(block, block.BlockSize())
   647  	} else {
   648  		err = errUnknownCipher
   649  	}
   650  	return ek, err
   651  }
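
// exampleSealOpen is an illustrative sketch (not part of the original file)
// of using an AEAD from genEncryptionKey the way this file handles the meta
// and key files: Seal prepends a random nonce to the ciphertext, and Open
// splits it back off. The ChaCha cipher choice and 32 byte seed size mirror
// genEncryptionKeys below.
func exampleSealOpen(plain []byte) ([]byte, error) {
	seed := make([]byte, 32)
	if _, err := rand.Read(seed); err != nil {
		return nil, err
	}
	aek, err := genEncryptionKey(ChaCha, seed)
	if err != nil {
		return nil, err
	}
	// Seal with the nonce prepended, the same layout used for key files below.
	nonce := make([]byte, aek.NonceSize(), aek.NonceSize()+len(plain)+aek.Overhead())
	rand.Read(nonce)
	sealed := aek.Seal(nonce, nonce, plain, nil)
	// Open by splitting the nonce back off the front.
	ns := aek.NonceSize()
	return aek.Open(nil, sealed[:ns], sealed[ns:], nil)
}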
   652  
   653  // Generate an asset encryption key from the context and server PRF.
   654  func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cipher.Stream, seed, encrypted []byte, err error) {
   655  	if fs.prf == nil {
   656  		return nil, nil, nil, nil, errNoEncryption
   657  	}
   658  	// Generate key encryption key.
   659  	rb, err := fs.prf([]byte(context))
   660  	if err != nil {
   661  		return nil, nil, nil, nil, err
   662  	}
   663  
   664  	sc := fs.fcfg.Cipher
   665  
   666  	kek, err := genEncryptionKey(sc, rb)
   667  	if err != nil {
   668  		return nil, nil, nil, nil, err
   669  	}
   670  	// Generate random asset encryption key seed.
   671  
   672  	const seedSize = 32
   673  	seed = make([]byte, seedSize)
   674  	if n, err := rand.Read(seed); err != nil || n != seedSize {
   675  		return nil, nil, nil, nil, err
   676  	}
   677  
   678  	aek, err = genEncryptionKey(sc, seed)
   679  	if err != nil {
   680  		return nil, nil, nil, nil, err
   681  	}
   682  
   683  	// Generate our nonce. Use same buffer to hold encrypted seed.
   684  	nonce := make([]byte, kek.NonceSize(), kek.NonceSize()+len(seed)+kek.Overhead())
   685  	rand.Read(nonce)
   686  
   687  	bek, err = genBlockEncryptionKey(sc, seed[:], nonce)
   688  	if err != nil {
   689  		return nil, nil, nil, nil, err
   690  	}
   691  
   692  	return aek, bek, seed, kek.Seal(nonce, nonce, seed, nil), nil
   693  }
   694  
   695  // Will generate the block encryption key.
   696  func genBlockEncryptionKey(sc StoreCipher, seed, nonce []byte) (cipher.Stream, error) {
   697  	if sc == ChaCha {
   698  		return chacha20.NewUnauthenticatedCipher(seed, nonce)
   699  	} else if sc == AES {
   700  		block, err := aes.NewCipher(seed)
   701  		if err != nil {
   702  			return nil, err
   703  		}
   704  		return cipher.NewCTR(block, nonce), nil
   705  	}
   706  	return nil, errUnknownCipher
   707  }
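
// exampleBlockStream is an illustrative sketch (not part of the original
// file) of the property the recovery and conversion paths below rely on: the
// block stream cipher is symmetric, so XORKeyStream with a fresh stream built
// from the same seed and nonce undoes the encryption. Seed and nonce are
// assumed to be valid sizes for the chosen cipher (32 and 24 bytes for the
// ChaCha path as used in genEncryptionKeys).
func exampleBlockStream(seed, nonce, blk []byte) ([]byte, error) {
	enc, err := genBlockEncryptionKey(ChaCha, seed, nonce)
	if err != nil {
		return nil, err
	}
	enc.XORKeyStream(blk, blk) // encrypt in place
	dec, err := genBlockEncryptionKey(ChaCha, seed, nonce)
	if err != nil {
		return nil, err
	}
	dec.XORKeyStream(blk, blk) // decrypt in place, blk is plaintext again
	return blk, nil
}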
   708  
   709  // Lock should be held.
   710  func (fs *fileStore) recoverAEK() error {
   711  	if fs.prf != nil && fs.aek == nil {
   712  		ekey, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey))
   713  		if err != nil {
   714  			return err
   715  		}
   716  		rb, err := fs.prf([]byte(fs.cfg.Name))
   717  		if err != nil {
   718  			return err
   719  		}
   720  		kek, err := genEncryptionKey(fs.fcfg.Cipher, rb)
   721  		if err != nil {
   722  			return err
   723  		}
   724  		ns := kek.NonceSize()
   725  		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
   726  		if err != nil {
   727  			return err
   728  		}
   729  		aek, err := genEncryptionKey(fs.fcfg.Cipher, seed)
   730  		if err != nil {
   731  			return err
   732  		}
   733  		fs.aek = aek
   734  	}
   735  	return nil
   736  }
   737  
   738  // Lock should be held.
   739  func (fs *fileStore) setupAEK() error {
   740  	if fs.prf != nil && fs.aek == nil {
   741  		key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name)
   742  		if err != nil {
   743  			return err
   744  		}
   745  		keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)
   746  		if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
   747  			return err
   748  		}
   749  		<-dios
   750  		err = os.WriteFile(keyFile, encrypted, defaultFilePerms)
   751  		dios <- struct{}{}
   752  		if err != nil {
   753  			return err
   754  		}
   755  		// Set our aek.
   756  		fs.aek = key
   757  	}
   758  	return nil
   759  }
   760  
   761  // Write out meta and the checksum.
   762  // Lock should be held.
   763  func (fs *fileStore) writeStreamMeta() error {
   764  	if err := fs.setupAEK(); err != nil {
   765  		return err
   766  	}
   767  
   768  	meta := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)
   769  	if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) {
   770  		return err
   771  	}
   772  	b, err := json.Marshal(fs.cfg)
   773  	if err != nil {
   774  		return err
   775  	}
   776  	// Encrypt if needed.
   777  	if fs.aek != nil {
   778  		nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(b)+fs.aek.Overhead())
   779  		rand.Read(nonce)
   780  		b = fs.aek.Seal(nonce, nonce, b, nil)
   781  	}
   782  
   783  	<-dios
   784  	err = os.WriteFile(meta, b, defaultFilePerms)
   785  	dios <- struct{}{}
   786  	if err != nil {
   787  		return err
   788  	}
   789  	fs.hh.Reset()
   790  	fs.hh.Write(b)
   791  	checksum := hex.EncodeToString(fs.hh.Sum(nil))
   792  	sum := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileSum)
   793  	<-dios
   794  	err = os.WriteFile(sum, []byte(checksum), defaultFilePerms)
   795  	dios <- struct{}{}
   796  	if err != nil {
   797  		return err
   798  	}
   799  	return nil
   800  }
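
// exampleVerifyStreamMeta is an illustrative sketch (not part of the original
// file) of checking meta.inf against meta.sum as produced by writeStreamMeta
// above: the sum file holds the hex encoded highwayhash of the raw (possibly
// encrypted) meta file contents. The method name is hypothetical.
// Lock should be held since fs.hh is shared.
func (fs *fileStore) exampleVerifyStreamMeta() (bool, error) {
	b, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile))
	if err != nil {
		return false, err
	}
	sum, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileSum))
	if err != nil {
		return false, err
	}
	fs.hh.Reset()
	fs.hh.Write(b)
	return hex.EncodeToString(fs.hh.Sum(nil)) == string(sum), nil
}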
   801  
   802  // Pools to recycle the blocks to help with memory pressure.
    803  var blkPoolBig sync.Pool    // 8MB
    804  var blkPoolMedium sync.Pool // 4MB
    805  var blkPoolSmall sync.Pool  // 1MB
   806  
   807  // Get a new msg block based on sz estimate.
   808  func getMsgBlockBuf(sz int) (buf []byte) {
   809  	var pb any
   810  	if sz <= defaultSmallBlockSize {
   811  		pb = blkPoolSmall.Get()
   812  	} else if sz <= defaultMediumBlockSize {
   813  		pb = blkPoolMedium.Get()
   814  	} else {
   815  		pb = blkPoolBig.Get()
   816  	}
   817  	if pb != nil {
   818  		buf = *(pb.(*[]byte))
   819  	} else {
   820  		// Here we need to make a new blk.
   821  		// If small leave as is..
   822  		if sz > defaultSmallBlockSize && sz <= defaultMediumBlockSize {
   823  			sz = defaultMediumBlockSize
   824  		} else if sz > defaultMediumBlockSize {
   825  			sz = defaultLargeBlockSize
   826  		}
   827  		buf = make([]byte, sz)
   828  	}
   829  	return buf[:0]
   830  }
   831  
   832  // Recycle the msg block.
   833  func recycleMsgBlockBuf(buf []byte) {
   834  	if buf == nil || cap(buf) < defaultSmallBlockSize {
   835  		return
   836  	}
   837  	// Make sure to reset before placing back into pool.
   838  	buf = buf[:0]
   839  
   840  	// We need to make sure the load code gets a block that can fit the maximum for a size block.
    841  	// E.g. 1MB, 4MB or 8MB, otherwise we thrash and actually make things worse by pulling it out, and putting
    842  	// it right back in and making a new []byte.
    843  	// From above we know it's already >= defaultSmallBlockSize
   844  	if sz := cap(buf); sz < defaultMediumBlockSize {
   845  		blkPoolSmall.Put(&buf)
   846  	} else if sz < defaultLargeBlockSize {
   847  		blkPoolMedium.Put(&buf)
   848  	} else {
   849  		blkPoolBig.Put(&buf)
   850  	}
   851  }
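
// exampleBufReuse is an illustrative sketch (not part of the original file)
// of the pool discipline above: take a buffer sized for the block, append
// records into it, and return it to the matching pool when done so later
// loads can reuse the allocation.
func exampleBufReuse(sz int, rec []byte) {
	buf := getMsgBlockBuf(sz) // length 0, capacity at least the small block size
	buf = append(buf, rec...)
	// ... write or parse buf ...
	recycleMsgBlockBuf(buf)
}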
   852  
   853  const (
   854  	msgHdrSize     = 22
   855  	checksumSize   = 8
   856  	emptyRecordLen = msgHdrSize + checksumSize
   857  )
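
// Record layout, as parsed by rebuildStateLocked below: each record starts
// with a 22 byte little endian header of total record length (uint32, with
// hbit flagging the presence of headers), sequence (uint64, with ebit/tbit
// marking erased messages and tombstones), timestamp (int64 nanoseconds) and
// subject length (uint16). The subject, optional headers and message payload
// follow, and the record ends with an 8 byte highwayhash checksum
// (recordHashSize) computed over the sequence and timestamp fields, the
// subject and the remaining body.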
   858  
   859  // Lock should be held.
   860  func (fs *fileStore) noTrackSubjects() bool {
   861  	return !(fs.psim.Size() > 0 || len(fs.cfg.Subjects) > 0 || fs.cfg.Mirror != nil || len(fs.cfg.Sources) > 0)
   862  }
   863  
   864  // Will init the basics for a message block.
   865  func (fs *fileStore) initMsgBlock(index uint32) *msgBlock {
   866  	mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects(), syncAlways: fs.fcfg.SyncAlways}
   867  
   868  	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
   869  	mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, index))
   870  
   871  	if mb.hh == nil {
   872  		key := sha256.Sum256(fs.hashKeyForBlock(index))
   873  		mb.hh, _ = highwayhash.New64(key[:])
   874  	}
   875  	return mb
   876  }
   877  
   878  // Lock for fs should be held.
   879  func (fs *fileStore) loadEncryptionForMsgBlock(mb *msgBlock) error {
   880  	if fs.prf == nil {
   881  		return nil
   882  	}
   883  
   884  	var createdKeys bool
   885  	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
   886  	ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
   887  	if err != nil {
   888  		// We do not seem to have keys even though we should. Could be a plaintext conversion.
   889  		// Create the keys and we will double check below.
   890  		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
   891  			return err
   892  		}
   893  		createdKeys = true
   894  	} else {
   895  		if len(ekey) < minBlkKeySize {
   896  			return errBadKeySize
   897  		}
   898  		// Recover key encryption key.
   899  		rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
   900  		if err != nil {
   901  			return err
   902  		}
   903  
   904  		sc := fs.fcfg.Cipher
   905  		kek, err := genEncryptionKey(sc, rb)
   906  		if err != nil {
   907  			return err
   908  		}
   909  		ns := kek.NonceSize()
   910  		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
   911  		if err != nil {
   912  			// We may be here on a cipher conversion, so attempt to convert.
   913  			if err = mb.convertCipher(); err != nil {
   914  				return err
   915  			}
   916  		} else {
   917  			mb.seed, mb.nonce = seed, ekey[:ns]
   918  		}
   919  		mb.aek, err = genEncryptionKey(sc, mb.seed)
   920  		if err != nil {
   921  			return err
   922  		}
   923  		if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil {
   924  			return err
   925  		}
   926  	}
   927  
   928  	// If we created keys here, let's check the data and if it is plaintext convert here.
   929  	if createdKeys {
   930  		if err := mb.convertToEncrypted(); err != nil {
   931  			return err
   932  		}
   933  	}
   934  
   935  	return nil
   936  }
   937  
   938  // Load a last checksum if needed from the block file.
   939  // Lock should be held.
   940  func (mb *msgBlock) ensureLastChecksumLoaded() {
   941  	var empty [8]byte
   942  	if mb.lchk != empty {
   943  		return
   944  	}
   945  	copy(mb.lchk[0:], mb.lastChecksum())
   946  }
   947  
   948  // Perform a recover but do not update PSIM.
   949  // Lock should be held.
   950  func (fs *fileStore) recoverMsgBlockNoSubjectUpdates(index uint32) (*msgBlock, error) {
   951  	psim, tsl := fs.psim, fs.tsl
   952  	fs.psim = nil
   953  	mb, err := fs.recoverMsgBlock(index)
   954  	fs.psim, fs.tsl = psim, tsl
   955  	return mb, err
   956  }
   957  
   958  // Lock held on entry
   959  func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) {
   960  	mb := fs.initMsgBlock(index)
   961  
   962  	// Open up the message file, but we will try to recover from the index file.
   963  	// We will check that the last checksums match.
   964  	file, err := mb.openBlock()
   965  	if err != nil {
   966  		return nil, err
   967  	}
   968  	defer file.Close()
   969  
   970  	if fi, err := file.Stat(); fi != nil {
   971  		mb.rbytes = uint64(fi.Size())
   972  	} else {
   973  		return nil, err
   974  	}
   975  
   976  	// Make sure encryption loaded if needed.
   977  	fs.loadEncryptionForMsgBlock(mb)
   978  
   979  	// Grab last checksum from main block file.
   980  	var lchk [8]byte
   981  	if mb.rbytes >= checksumSize {
   982  		if mb.bek != nil {
   983  			if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize {
   984  				mb.bek.XORKeyStream(buf, buf)
   985  				copy(lchk[0:], buf[len(buf)-checksumSize:])
   986  			}
   987  		} else {
   988  			file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
   989  		}
   990  	}
   991  
   992  	file.Close()
   993  
   994  	// Read our index file. Use this as source of truth if possible.
   995  	if err := mb.readIndexInfo(); err == nil {
   996  		// Quick sanity check here.
    997  		// Note this only checks that the message blk file is not newer than this file, or is empty and we expect empty.
   998  		if (mb.rbytes == 0 && mb.msgs == 0) || bytes.Equal(lchk[:], mb.lchk[:]) {
   999  			if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
  1000  				fs.populateGlobalPerSubjectInfo(mb)
  1001  				// Try to dump any state we needed on recovery.
  1002  				mb.tryForceExpireCacheLocked()
  1003  			}
  1004  			fs.addMsgBlock(mb)
  1005  			return mb, nil
  1006  		}
  1007  	}
  1008  
   1009  	// If we get data loss rebuilding the message block state, record that with the fs itself.
  1010  	ld, tombs, _ := mb.rebuildState()
  1011  	if ld != nil {
  1012  		fs.addLostData(ld)
  1013  	}
  1014  	// Collect all tombstones.
  1015  	if len(tombs) > 0 {
  1016  		fs.tombs = append(fs.tombs, tombs...)
  1017  	}
  1018  
  1019  	if mb.msgs > 0 && !mb.noTrack && fs.psim != nil {
  1020  		fs.populateGlobalPerSubjectInfo(mb)
  1021  		// Try to dump any state we needed on recovery.
  1022  		mb.tryForceExpireCacheLocked()
  1023  	}
  1024  
  1025  	mb.closeFDs()
  1026  	fs.addMsgBlock(mb)
  1027  
  1028  	return mb, nil
  1029  }
  1030  
  1031  func (fs *fileStore) lostData() *LostStreamData {
  1032  	fs.mu.RLock()
  1033  	defer fs.mu.RUnlock()
  1034  	if fs.ld == nil {
  1035  		return nil
  1036  	}
  1037  	nld := *fs.ld
  1038  	return &nld
  1039  }
  1040  
  1041  // Lock should be held.
  1042  func (fs *fileStore) addLostData(ld *LostStreamData) {
  1043  	if ld == nil {
  1044  		return
  1045  	}
  1046  	if fs.ld != nil {
  1047  		var added bool
  1048  		for _, seq := range ld.Msgs {
  1049  			if _, found := fs.ld.exists(seq); !found {
  1050  				fs.ld.Msgs = append(fs.ld.Msgs, seq)
  1051  				added = true
  1052  			}
  1053  		}
  1054  		if added {
  1055  			msgs := fs.ld.Msgs
  1056  			sort.Slice(msgs, func(i, j int) bool { return msgs[i] < msgs[j] })
  1057  			fs.ld.Bytes += ld.Bytes
  1058  		}
  1059  	} else {
  1060  		fs.ld = ld
  1061  	}
  1062  }
  1063  
  1064  // Helper to see if we already have this sequence reported in our lost data.
  1065  func (ld *LostStreamData) exists(seq uint64) (int, bool) {
  1066  	i, found := sort.Find(len(ld.Msgs), func(i int) int {
  1067  		tseq := ld.Msgs[i]
  1068  		if tseq < seq {
  1069  			return -1
  1070  		}
  1071  		if tseq > seq {
  1072  			return +1
  1073  		}
  1074  		return 0
  1075  	})
  1076  	return i, found
  1077  }
  1078  
  1079  func (fs *fileStore) removeFromLostData(seq uint64) {
  1080  	if fs.ld == nil {
  1081  		return
  1082  	}
  1083  	if i, found := fs.ld.exists(seq); found {
  1084  		fs.ld.Msgs = append(fs.ld.Msgs[:i], fs.ld.Msgs[i+1:]...)
  1085  		if len(fs.ld.Msgs) == 0 {
  1086  			fs.ld = nil
  1087  		}
  1088  	}
  1089  }
  1090  
  1091  func (fs *fileStore) rebuildState(ld *LostStreamData) {
  1092  	fs.mu.Lock()
  1093  	defer fs.mu.Unlock()
  1094  	fs.rebuildStateLocked(ld)
  1095  }
  1096  
  1097  // Lock should be held.
  1098  func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) {
  1099  	fs.addLostData(ld)
  1100  
  1101  	fs.state.Msgs, fs.state.Bytes = 0, 0
  1102  	fs.state.FirstSeq, fs.state.LastSeq = 0, 0
  1103  
  1104  	for _, mb := range fs.blks {
  1105  		mb.mu.RLock()
  1106  		fs.state.Msgs += mb.msgs
  1107  		fs.state.Bytes += mb.bytes
  1108  		fseq := atomic.LoadUint64(&mb.first.seq)
  1109  		if fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq {
  1110  			fs.state.FirstSeq = fseq
  1111  			fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
  1112  		}
  1113  		fs.state.LastSeq = atomic.LoadUint64(&mb.last.seq)
  1114  		fs.state.LastTime = time.Unix(0, mb.last.ts).UTC()
  1115  		mb.mu.RUnlock()
  1116  	}
  1117  }
  1118  
  1119  // Attempt to convert the cipher used for this message block.
  1120  func (mb *msgBlock) convertCipher() error {
  1121  	fs := mb.fs
  1122  	sc := fs.fcfg.Cipher
  1123  
  1124  	var osc StoreCipher
  1125  	switch sc {
  1126  	case ChaCha:
  1127  		osc = AES
  1128  	case AES:
  1129  		osc = ChaCha
  1130  	}
  1131  
  1132  	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
  1133  	ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)))
  1134  	if err != nil {
  1135  		return err
  1136  	}
  1137  	if len(ekey) < minBlkKeySize {
  1138  		return errBadKeySize
  1139  	}
  1140  	type prfWithCipher struct {
  1141  		keyGen
  1142  		StoreCipher
  1143  	}
  1144  	var prfs []prfWithCipher
  1145  	if fs.prf != nil {
  1146  		prfs = append(prfs, prfWithCipher{fs.prf, sc})
  1147  		prfs = append(prfs, prfWithCipher{fs.prf, osc})
  1148  	}
  1149  	if fs.oldprf != nil {
  1150  		prfs = append(prfs, prfWithCipher{fs.oldprf, sc})
  1151  		prfs = append(prfs, prfWithCipher{fs.oldprf, osc})
  1152  	}
  1153  
  1154  	for _, prf := range prfs {
  1155  		// Recover key encryption key.
  1156  		rb, err := prf.keyGen([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)))
  1157  		if err != nil {
  1158  			continue
  1159  		}
  1160  		kek, err := genEncryptionKey(prf.StoreCipher, rb)
  1161  		if err != nil {
  1162  			continue
  1163  		}
  1164  		ns := kek.NonceSize()
  1165  		seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil)
  1166  		if err != nil {
  1167  			continue
  1168  		}
  1169  		nonce := ekey[:ns]
  1170  		bek, err := genBlockEncryptionKey(prf.StoreCipher, seed, nonce)
  1171  		if err != nil {
  1172  			return err
  1173  		}
  1174  
  1175  		buf, _ := mb.loadBlock(nil)
  1176  		bek.XORKeyStream(buf, buf)
  1177  		// Make sure we can parse with old cipher and key file.
  1178  		if err = mb.indexCacheBuf(buf); err != nil {
  1179  			return err
  1180  		}
  1181  		// Reset the cache since we just read everything in.
  1182  		mb.cache = nil
  1183  
  1184  		// Generate new keys. If we error for some reason then we will put
  1185  		// the old keyfile back.
  1186  		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
  1187  			keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))
  1188  			<-dios
  1189  			os.WriteFile(keyFile, ekey, defaultFilePerms)
  1190  			dios <- struct{}{}
  1191  			return err
  1192  		}
  1193  		mb.bek.XORKeyStream(buf, buf)
  1194  		<-dios
  1195  		err = os.WriteFile(mb.mfn, buf, defaultFilePerms)
  1196  		dios <- struct{}{}
  1197  		if err != nil {
  1198  			return err
  1199  		}
  1200  		return nil
  1201  	}
  1202  	return fmt.Errorf("unable to recover keys")
  1203  }
  1204  
  1205  // Convert a plaintext block to encrypted.
  1206  func (mb *msgBlock) convertToEncrypted() error {
  1207  	if mb.bek == nil {
  1208  		return nil
  1209  	}
  1210  	buf, err := mb.loadBlock(nil)
  1211  	if err != nil {
  1212  		return err
  1213  	}
  1214  	if err := mb.indexCacheBuf(buf); err != nil {
  1215  		// This likely indicates this was already encrypted or corrupt.
  1216  		mb.cache = nil
  1217  		return err
  1218  	}
  1219  	// Undo cache from above for later.
  1220  	mb.cache = nil
  1221  	mb.bek.XORKeyStream(buf, buf)
  1222  	<-dios
  1223  	err = os.WriteFile(mb.mfn, buf, defaultFilePerms)
  1224  	dios <- struct{}{}
  1225  	if err != nil {
  1226  		return err
  1227  	}
  1228  	return nil
  1229  }
  1230  
  1231  // Rebuild the state of the blk based on what we have on disk in the N.blk file.
  1232  // We will return any lost data, and we will return any delete tombstones we encountered.
  1233  func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) {
  1234  	mb.mu.Lock()
  1235  	defer mb.mu.Unlock()
  1236  	return mb.rebuildStateLocked()
  1237  }
  1238  
  1239  // Rebuild the state of the blk based on what we have on disk in the N.blk file.
  1240  // Lock should be held.
  1241  func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) {
  1242  	startLastSeq := atomic.LoadUint64(&mb.last.seq)
  1243  
  1244  	// Remove the .fss file and clear any cache we have set.
  1245  	mb.clearCacheAndOffset()
  1246  
  1247  	buf, err := mb.loadBlock(nil)
  1248  	defer recycleMsgBlockBuf(buf)
  1249  
  1250  	if err != nil || len(buf) == 0 {
  1251  		var ld *LostStreamData
  1252  		// No data to rebuild from here.
  1253  		if mb.msgs > 0 {
  1254  			// We need to declare lost data here.
  1255  			ld = &LostStreamData{Msgs: make([]uint64, 0, mb.msgs), Bytes: mb.bytes}
  1256  			firstSeq, lastSeq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
  1257  			for seq := firstSeq; seq <= lastSeq; seq++ {
  1258  				if !mb.dmap.Exists(seq) {
  1259  					ld.Msgs = append(ld.Msgs, seq)
  1260  				}
  1261  			}
  1262  			// Clear invalid state. We will let this blk be added in here.
  1263  			mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
  1264  			mb.dmap.Empty()
  1265  			atomic.StoreUint64(&mb.first.seq, atomic.LoadUint64(&mb.last.seq)+1)
  1266  		}
  1267  		return ld, nil, err
  1268  	}
  1269  
  1270  	// Clear state we need to rebuild.
  1271  	mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil
  1272  	atomic.StoreUint64(&mb.last.seq, 0)
  1273  	mb.last.ts = 0
  1274  	firstNeedsSet := true
  1275  
  1276  	// Check if we need to decrypt.
  1277  	if mb.bek != nil && len(buf) > 0 {
  1278  		// Recreate to reset counter.
  1279  		mb.bek, err = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  1280  		if err != nil {
  1281  			return nil, nil, err
  1282  		}
  1283  		mb.bek.XORKeyStream(buf, buf)
  1284  	}
  1285  
  1286  	// Check for compression.
  1287  	if buf, err = mb.decompressIfNeeded(buf); err != nil {
  1288  		return nil, nil, err
  1289  	}
  1290  
  1291  	mb.rbytes = uint64(len(buf))
  1292  
  1293  	addToDmap := func(seq uint64) {
  1294  		if seq == 0 {
  1295  			return
  1296  		}
  1297  		mb.dmap.Insert(seq)
  1298  	}
  1299  
  1300  	var le = binary.LittleEndian
  1301  
  1302  	truncate := func(index uint32) {
  1303  		var fd *os.File
  1304  		if mb.mfd != nil {
  1305  			fd = mb.mfd
  1306  		} else {
  1307  			<-dios
  1308  			fd, err = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
  1309  			dios <- struct{}{}
  1310  			if err == nil {
  1311  				defer fd.Close()
  1312  			}
  1313  		}
  1314  		if fd == nil {
  1315  			return
  1316  		}
  1317  		if err := fd.Truncate(int64(index)); err == nil {
  1318  			// Update our checksum.
  1319  			if index >= 8 {
  1320  				var lchk [8]byte
  1321  				fd.ReadAt(lchk[:], int64(index-8))
  1322  				copy(mb.lchk[0:], lchk[:])
  1323  			}
  1324  			fd.Sync()
  1325  		}
  1326  	}
  1327  
  1328  	gatherLost := func(lb uint32) *LostStreamData {
  1329  		var ld LostStreamData
  1330  		for seq := atomic.LoadUint64(&mb.last.seq) + 1; seq <= startLastSeq; seq++ {
  1331  			ld.Msgs = append(ld.Msgs, seq)
  1332  		}
  1333  		ld.Bytes = uint64(lb)
  1334  		return &ld
  1335  	}
  1336  
  1337  	// For tombstones that we find and collect.
  1338  	var (
  1339  		tombstones      []uint64
  1340  		minTombstoneSeq uint64
  1341  		minTombstoneTs  int64
  1342  	)
  1343  
  1344  	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
  1345  		if index+msgHdrSize > lbuf {
  1346  			truncate(index)
  1347  			return gatherLost(lbuf - index), tombstones, nil
  1348  		}
  1349  
  1350  		hdr := buf[index : index+msgHdrSize]
  1351  		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])
  1352  
  1353  		hasHeaders := rl&hbit != 0
  1354  		// Clear any headers bit that could be set.
  1355  		rl &^= hbit
  1356  		dlen := int(rl) - msgHdrSize
  1357  		// Do some quick sanity checks here.
  1358  		if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
  1359  			truncate(index)
  1360  			return gatherLost(lbuf - index), tombstones, errBadMsg
  1361  		}
  1362  
  1363  		// Check for checksum failures before additional processing.
  1364  		data := buf[index+msgHdrSize : index+rl]
  1365  		if hh := mb.hh; hh != nil {
  1366  			hh.Reset()
  1367  			hh.Write(hdr[4:20])
  1368  			hh.Write(data[:slen])
  1369  			if hasHeaders {
  1370  				hh.Write(data[slen+4 : dlen-recordHashSize])
  1371  			} else {
  1372  				hh.Write(data[slen : dlen-recordHashSize])
  1373  			}
  1374  			checksum := hh.Sum(nil)
  1375  			if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) {
  1376  				truncate(index)
  1377  				return gatherLost(lbuf - index), tombstones, errBadMsg
  1378  			}
  1379  			copy(mb.lchk[0:], checksum)
  1380  		}
  1381  
  1382  		// Grab our sequence and timestamp.
  1383  		seq := le.Uint64(hdr[4:])
  1384  		ts := int64(le.Uint64(hdr[12:]))
  1385  
  1386  		// Check if this is a delete tombstone.
  1387  		if seq&tbit != 0 {
  1388  			seq = seq &^ tbit
  1389  			// Need to process this here and make sure we have accounted for this properly.
  1390  			tombstones = append(tombstones, seq)
  1391  			if minTombstoneSeq == 0 || seq < minTombstoneSeq {
  1392  				minTombstoneSeq, minTombstoneTs = seq, ts
  1393  			}
  1394  			index += rl
  1395  			continue
  1396  		}
  1397  
  1398  		fseq := atomic.LoadUint64(&mb.first.seq)
  1399  		// This is an old erased message, or a new one that we can track.
  1400  		if seq == 0 || seq&ebit != 0 || seq < fseq {
  1401  			seq = seq &^ ebit
  1402  			if seq >= fseq {
  1403  				// Only add to dmap if past recorded first seq and non-zero.
  1404  				if seq != 0 {
  1405  					addToDmap(seq)
  1406  				}
  1407  				atomic.StoreUint64(&mb.last.seq, seq)
  1408  				mb.last.ts = ts
  1409  				if mb.msgs == 0 {
  1410  					atomic.StoreUint64(&mb.first.seq, seq+1)
  1411  					mb.first.ts = 0
  1412  				}
  1413  			}
  1414  			index += rl
  1415  			continue
  1416  		}
  1417  
  1418  		// This is for when we have index info that adjusts for deleted messages
  1419  		// at the head. So the first.seq will be already set here. If this is larger
  1420  		// replace what we have with this seq.
  1421  		if firstNeedsSet && seq >= fseq {
  1422  			atomic.StoreUint64(&mb.first.seq, seq)
  1423  			firstNeedsSet, mb.first.ts = false, ts
  1424  		}
  1425  
  1426  		if !mb.dmap.Exists(seq) {
  1427  			mb.msgs++
  1428  			mb.bytes += uint64(rl)
  1429  		}
  1430  
  1431  		// Always set last
  1432  		atomic.StoreUint64(&mb.last.seq, seq)
  1433  		mb.last.ts = ts
  1434  
  1435  		// Advance to next record.
  1436  		index += rl
  1437  	}
  1438  
  1439  	// For empty msg blocks make sure we recover last seq correctly based off of first.
  1440  	// Or if we seem to have no messages but had a tombstone, which we use to remember
  1441  	// sequences and timestamps now, use that to properly setup the first and last.
  1442  	if mb.msgs == 0 {
  1443  		fseq := atomic.LoadUint64(&mb.first.seq)
  1444  		if fseq > 0 {
  1445  			atomic.StoreUint64(&mb.last.seq, fseq-1)
  1446  		} else if fseq == 0 && minTombstoneSeq > 0 {
  1447  			atomic.StoreUint64(&mb.first.seq, minTombstoneSeq+1)
  1448  			mb.first.ts = 0
  1449  			if mb.last.seq == 0 {
  1450  				atomic.StoreUint64(&mb.last.seq, minTombstoneSeq)
  1451  				mb.last.ts = minTombstoneTs
  1452  			}
  1453  		}
  1454  	}
  1455  
  1456  	return nil, tombstones, nil
  1457  }
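
// exampleRecordHeader is an illustrative sketch (not part of the original
// file) of building the 22 byte record header that rebuildStateLocked parses
// above: little endian record length, sequence, timestamp and subject length,
// with the record length counting the header, subject, payload and trailing
// 8 byte checksum. Flag bits (hbit/ebit/tbit) are left clear for a plain
// live message without headers.
func exampleRecordHeader(seq uint64, ts int64, subj string, msg []byte) [msgHdrSize]byte {
	var hdr [msgHdrSize]byte
	le := binary.LittleEndian
	rl := uint32(msgHdrSize + len(subj) + len(msg) + recordHashSize)
	le.PutUint32(hdr[0:], rl)          // total record length
	le.PutUint64(hdr[4:], seq)         // sequence
	le.PutUint64(hdr[12:], uint64(ts)) // timestamp in nanoseconds
	le.PutUint16(hdr[20:], uint16(len(subj)))
	return hdr
}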
  1458  
  1459  // For doing warn logging.
  1460  // Lock should be held.
  1461  func (fs *fileStore) warn(format string, args ...any) {
  1462  	// No-op if no server configured.
  1463  	if fs.srv == nil {
  1464  		return
  1465  	}
  1466  	fs.srv.Warnf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...)
  1467  }
  1468  
  1469  // For doing debug logging.
  1470  // Lock should be held.
  1471  func (fs *fileStore) debug(format string, args ...any) {
  1472  	// No-op if no server configured.
  1473  	if fs.srv == nil {
  1474  		return
  1475  	}
  1476  	fs.srv.Debugf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...)
  1477  }
  1478  
  1479  // Track local state but ignore timestamps here.
  1480  func updateTrackingState(state *StreamState, mb *msgBlock) {
  1481  	if state.FirstSeq == 0 {
  1482  		state.FirstSeq = mb.first.seq
  1483  	} else if mb.first.seq < state.FirstSeq {
  1484  		state.FirstSeq = mb.first.seq
  1485  	}
  1486  	if mb.last.seq > state.LastSeq {
  1487  		state.LastSeq = mb.last.seq
  1488  	}
  1489  	state.Msgs += mb.msgs
  1490  	state.Bytes += mb.bytes
  1491  }
  1492  
  1493  // Determine if our tracking states are the same.
  1494  func trackingStatesEqual(fs, mb *StreamState) bool {
  1495  	// When a fs is brand new the fs state will have first seq of 0, but tracking mb may have 1.
  1496  	// If either has a first sequence that is not 0 or 1 we will check if they are the same, otherwise skip.
  1497  	if fs.FirstSeq > 1 || mb.FirstSeq > 1 {
  1498  		return fs.Msgs == mb.Msgs && fs.FirstSeq == mb.FirstSeq && fs.LastSeq == mb.LastSeq && fs.Bytes == mb.Bytes
  1499  	}
  1500  	return fs.Msgs == mb.Msgs && fs.LastSeq == mb.LastSeq && fs.Bytes == mb.Bytes
  1501  }
  1502  
   1503  // recoverFullState will attempt to recover our last full state and re-process any state changes
  1504  // that happened afterwards.
  1505  func (fs *fileStore) recoverFullState() (rerr error) {
  1506  	fs.mu.Lock()
  1507  	defer fs.mu.Unlock()
  1508  
  1509  	// Check for any left over purged messages.
  1510  	<-dios
  1511  	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
  1512  	if _, err := os.Stat(pdir); err == nil {
  1513  		os.RemoveAll(pdir)
  1514  	}
  1515  	// Grab our stream state file and load it in.
  1516  	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
  1517  	buf, err := os.ReadFile(fn)
  1518  	dios <- struct{}{}
  1519  
  1520  	if err != nil {
  1521  		if !os.IsNotExist(err) {
  1522  			fs.warn("Could not read stream state file: %v", err)
  1523  		}
  1524  		return err
  1525  	}
  1526  
  1527  	const minLen = 32
  1528  	if len(buf) < minLen {
  1529  		os.Remove(fn)
  1530  		fs.warn("Stream state too short (%d bytes)", len(buf))
  1531  		return errCorruptState
  1532  	}
  1533  
  1534  	// The highwayhash will be on the end. Check that it still matches.
  1535  	h := buf[len(buf)-highwayhash.Size64:]
  1536  	buf = buf[:len(buf)-highwayhash.Size64]
  1537  	fs.hh.Reset()
  1538  	fs.hh.Write(buf)
  1539  	if !bytes.Equal(h, fs.hh.Sum(nil)) {
  1540  		os.Remove(fn)
  1541  		fs.warn("Stream state checksum did not match")
  1542  		return errCorruptState
  1543  	}
  1544  
  1545  	// Decrypt if needed.
  1546  	if fs.prf != nil {
   1547  		// We can be set up for encryption, but if this is a snapshot restore we will be missing the keyfile
  1548  		// since snapshots strip encryption.
  1549  		if err := fs.recoverAEK(); err == nil {
  1550  			ns := fs.aek.NonceSize()
  1551  			buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:], nil)
  1552  			if err != nil {
  1553  				fs.warn("Stream state error reading encryption key: %v", err)
  1554  				return err
  1555  			}
  1556  		}
  1557  	}
  1558  
  1559  	if buf[0] != fullStateMagic || buf[1] != fullStateVersion {
  1560  		os.Remove(fn)
  1561  		fs.warn("Stream state magic and version mismatch")
  1562  		return errCorruptState
  1563  	}
  1564  
  1565  	bi := hdrLen
  1566  
  1567  	readU64 := func() uint64 {
  1568  		if bi < 0 {
  1569  			return 0
  1570  		}
  1571  		v, n := binary.Uvarint(buf[bi:])
  1572  		if n <= 0 {
  1573  			bi = -1
  1574  			return 0
  1575  		}
  1576  		bi += n
  1577  		return v
  1578  	}
  1579  	readI64 := func() int64 {
  1580  		if bi < 0 {
  1581  			return 0
  1582  		}
  1583  		v, n := binary.Varint(buf[bi:])
  1584  		if n <= 0 {
  1585  			bi = -1
  1586  			return -1
  1587  		}
  1588  		bi += n
  1589  		return v
  1590  	}
  1591  
  1592  	setTime := func(t *time.Time, ts int64) {
  1593  		if ts == 0 {
  1594  			*t = time.Time{}
  1595  		} else {
  1596  			*t = time.Unix(0, ts).UTC()
  1597  		}
  1598  	}
  1599  
  1600  	var state StreamState
  1601  	state.Msgs = readU64()
  1602  	state.Bytes = readU64()
  1603  	state.FirstSeq = readU64()
  1604  	baseTime := readI64()
  1605  	setTime(&state.FirstTime, baseTime)
  1606  	state.LastSeq = readU64()
  1607  	setTime(&state.LastTime, readI64())
  1608  
  1609  	// Check for per subject info.
  1610  	if numSubjects := int(readU64()); numSubjects > 0 {
  1611  		fs.psim, fs.tsl = fs.psim.Empty(), 0
  1612  		for i := 0; i < numSubjects; i++ {
  1613  			if lsubj := int(readU64()); lsubj > 0 {
  1614  				if bi+lsubj > len(buf) {
  1615  					os.Remove(fn)
  1616  					fs.warn("Stream state bad subject len (%d)", lsubj)
  1617  					return errCorruptState
  1618  				}
  1619  				// If we have lots of subjects this will alloc for each one.
  1620  				// We could reference the underlying buffer, but we could guess wrong if
  1621  				// number of blocks is large and subjects is low, since we would reference buf.
  1622  				subj := buf[bi : bi+lsubj]
  1623  				// We had a bug that could cause memory corruption in the PSIM that could have gotten stored to disk.
  1624  				// Only would affect subjects, so do quick check.
  1625  				if !isValidSubject(string(subj), true) {
  1626  					os.Remove(fn)
  1627  					fs.warn("Stream state corrupt subject detected")
  1628  					return errCorruptState
  1629  				}
  1630  				bi += lsubj
  1631  				psi := psi{total: readU64(), fblk: uint32(readU64())}
  1632  				if psi.total > 1 {
  1633  					psi.lblk = uint32(readU64())
  1634  				} else {
  1635  					psi.lblk = psi.fblk
  1636  				}
  1637  				fs.psim.Insert(subj, psi)
  1638  				fs.tsl += lsubj
  1639  			}
  1640  		}
  1641  	}
  1642  
  1643  	// Track the state as represented by the blocks themselves.
  1644  	var mstate StreamState
  1645  
  1646  	if numBlocks := readU64(); numBlocks > 0 {
  1647  		lastIndex := int(numBlocks - 1)
  1648  		fs.blks = make([]*msgBlock, 0, numBlocks)
  1649  		for i := 0; i < int(numBlocks); i++ {
  1650  			index, nbytes, fseq, fts, lseq, lts, numDeleted := uint32(readU64()), readU64(), readU64(), readI64(), readU64(), readI64(), readU64()
  1651  			if bi < 0 {
  1652  				break
  1653  			}
  1654  			mb := fs.initMsgBlock(index)
  1655  			atomic.StoreUint64(&mb.first.seq, fseq)
  1656  			atomic.StoreUint64(&mb.last.seq, lseq)
  1657  			mb.msgs, mb.bytes = lseq-fseq+1, nbytes
  1658  			mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime
  1659  			if numDeleted > 0 {
  1660  				dmap, n, err := avl.Decode(buf[bi:])
  1661  				if err != nil {
  1662  					os.Remove(fn)
  1663  					fs.warn("Stream state error decoding avl dmap: %v", err)
  1664  					return errCorruptState
  1665  				}
  1666  				mb.dmap = *dmap
  1667  				if mb.msgs > numDeleted {
  1668  					mb.msgs -= numDeleted
  1669  				} else {
  1670  					mb.msgs = 0
  1671  				}
  1672  				bi += n
  1673  			}
  1674  			// Only add it in if it is not empty or it is the lmb.
  1675  			if mb.msgs > 0 || i == lastIndex {
  1676  				fs.addMsgBlock(mb)
  1677  				updateTrackingState(&mstate, mb)
  1678  			} else {
  1679  				// Mark dirty to cleanup.
  1680  				fs.dirty++
  1681  			}
  1682  		}
  1683  	}
  1684  
  1685  	// Pull in the index of the block that held the last checksum when we wrote the full state.
  1686  	blkIndex := uint32(readU64())
  1687  	var lchk [8]byte
  1688  	if bi < 0 || bi+len(lchk) > len(buf) {
  1689  		bi = -1
  1690  	} else {
  1691  		copy(lchk[0:], buf[bi:bi+len(lchk)])
  1692  	}
  1693  
  1694  	// Check if we had any errors.
  1695  	if bi < 0 {
  1696  		os.Remove(fn)
  1697  		fs.warn("Stream state has no checksum present")
  1698  		return errCorruptState
  1699  	}
  1700  
  1701  	// Move into place our state, msgBlks and subject info.
  1702  	fs.state = state
  1703  
  1704  	// First let's check the happy path, open the blk file that was the lmb when we created the full state.
  1705  	// See if we have the last block available.
  1706  	var matched bool
  1707  	mb := fs.lmb
  1708  	if mb == nil || mb.index != blkIndex {
  1709  		fs.warn("Stream state block does not exist or index mismatch")
  1710  		return errCorruptState
  1711  	}
  1712  	if _, err := os.Stat(mb.mfn); err != nil && os.IsNotExist(err) {
  1713  		// If our saved state is past what we see on disk, fall back and rebuild.
  1714  		if ld, _, _ := mb.rebuildState(); ld != nil {
  1715  			fs.addLostData(ld)
  1716  		}
  1717  		fs.warn("Stream state detected prior state, could not locate msg block %d", blkIndex)
  1718  		return errPriorState
  1719  	}
  1720  	if matched = bytes.Equal(mb.lastChecksum(), lchk[:]); !matched {
  1721  		// Remove the last message block since recover will add in the new one.
  1722  		fs.removeMsgBlockFromList(mb)
  1723  		// Reverse update of tracking state for this mb, will add new state in below.
  1724  		mstate.Msgs -= mb.msgs
  1725  		mstate.Bytes -= mb.bytes
  1726  		if nmb, err := fs.recoverMsgBlockNoSubjectUpdates(mb.index); err != nil && !os.IsNotExist(err) {
  1727  			fs.warn("Stream state could not recover last msg block")
  1728  			os.Remove(fn)
  1729  			return errCorruptState
  1730  		} else if nmb != nil {
  1731  			fs.adjustAccounting(mb, nmb)
  1732  			updateTrackingState(&mstate, nmb)
  1733  		}
  1734  	}
  1735  
  1736  	// On success double check our state.
  1737  	checkState := func() error {
  1738  		// We check first and last seq and number of msgs and bytes. If there is a difference,
  1739  		// return an error so we rebuild from the message block state on disk.
  1740  		if !trackingStatesEqual(&fs.state, &mstate) {
  1741  			fs.warn("Stream state encountered internal inconsistency on recover")
  1742  			os.Remove(fn)
  1743  			return errCorruptState
  1744  		}
  1745  		return nil
  1746  	}
  1747  
  1748  	// We may need to check other blocks. Even if we matched last checksum we will see if there is another block.
  1749  	for bi := blkIndex + 1; ; bi++ {
  1750  		nmb, err := fs.recoverMsgBlock(bi)
  1751  		if err != nil {
  1752  			if os.IsNotExist(err) {
  1753  				return checkState()
  1754  			}
  1755  			os.Remove(fn)
  1756  			fs.warn("Stream state could not recover msg block %d", bi)
  1757  			return err
  1758  		}
  1759  		if nmb != nil {
  1760  			// Update top level accounting
  1761  			if fseq := atomic.LoadUint64(&nmb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq {
  1762  				fs.state.FirstSeq = fseq
  1763  				fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC()
  1764  			}
  1765  			if lseq := atomic.LoadUint64(&nmb.last.seq); lseq > fs.state.LastSeq {
  1766  				fs.state.LastSeq = lseq
  1767  				fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC()
  1768  			}
  1769  			fs.state.Msgs += nmb.msgs
  1770  			fs.state.Bytes += nmb.bytes
  1771  			updateTrackingState(&mstate, nmb)
  1772  		}
  1773  	}
  1774  }
  1775  
  1776  // adjustAccounting will be called when a stream state was only partially accounted for
  1777  // within a message block, e.g. additional records were added after the stream state.
  1778  // Lock should be held.
  1779  func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) {
  1780  	nmb.mu.Lock()
  1781  	defer nmb.mu.Unlock()
  1782  
  1783  	// First make sure the new block is loaded.
  1784  	if nmb.cacheNotLoaded() {
  1785  		nmb.loadMsgsWithLock()
  1786  	}
  1787  	nmb.ensurePerSubjectInfoLoaded()
  1788  
  1789  	// Walk only new messages and update accounting at fs level. Any messages that should have
  1790  	// triggered limits exceeded will be handled after the recovery and prior to the stream
  1791  	// being available to the system.
  1792  	var smv StoreMsg
  1793  	for seq, lseq := atomic.LoadUint64(&mb.last.seq)+1, atomic.LoadUint64(&nmb.last.seq); seq <= lseq; seq++ {
  1794  		// Look up the message. If there is an error it will be deleted, so we can skip it.
  1795  		sm, err := nmb.cacheLookup(seq, &smv)
  1796  		if err != nil {
  1797  			continue
  1798  		}
  1799  		// Since we found it we just need to adjust fs totals and psim.
  1800  		fs.state.Msgs++
  1801  		fs.state.Bytes += fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
  1802  		if len(sm.subj) > 0 && fs.psim != nil {
  1803  			if info, ok := fs.psim.Find(stringToBytes(sm.subj)); ok {
  1804  				info.total++
  1805  				if nmb.index > info.lblk {
  1806  					info.lblk = nmb.index
  1807  				}
  1808  			} else {
  1809  				fs.psim.Insert(stringToBytes(sm.subj), psi{total: 1, fblk: nmb.index, lblk: nmb.index})
  1810  				fs.tsl += len(sm.subj)
  1811  			}
  1812  		}
  1813  	}
  1814  
  1815  	// Now check to see if we had a higher first for the recovered state mb vs nmb.
  1816  	if atomic.LoadUint64(&nmb.first.seq) < atomic.LoadUint64(&mb.first.seq) {
  1817  		// Now set first for nmb.
  1818  		atomic.StoreUint64(&nmb.first.seq, atomic.LoadUint64(&mb.first.seq))
  1819  	}
  1820  
  1821  	// Update top level accounting.
  1822  	if fseq := atomic.LoadUint64(&nmb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq {
  1823  		fs.state.FirstSeq = fseq
  1824  		fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC()
  1825  	}
  1826  	if lseq := atomic.LoadUint64(&nmb.last.seq); lseq > fs.state.LastSeq {
  1827  		fs.state.LastSeq = lseq
  1828  		fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC()
  1829  	}
  1830  }
  1831  
  1832  // Grabs last checksum for the named block file.
  1833  // Takes into account encryption etc.
  1834  func (mb *msgBlock) lastChecksum() []byte {
  1835  	f, err := mb.openBlock()
  1836  	if err != nil {
  1837  		return nil
  1838  	}
  1839  	defer f.Close()
  1840  
  1841  	var lchk [8]byte
  1842  	if fi, _ := f.Stat(); fi != nil {
  1843  		mb.rbytes = uint64(fi.Size())
  1844  	}
  1845  	if mb.rbytes < checksumSize {
  1846  		return nil
  1847  	}
  1848  	// Encrypted?
  1849  	// Check for encryption, we do not load keys on startup anymore so might need to load them here.
  1850  	if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) {
  1851  		if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
  1852  			return nil
  1853  		}
  1854  	}
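        	// For encrypted blocks we load and decrypt the whole block with a freshly initialized
        	// key stream before reading the trailing checksum from the plaintext.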
  1855  	if mb.bek != nil {
  1856  		if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize {
  1857  			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  1858  			if err != nil {
  1859  				return nil
  1860  			}
  1861  			mb.bek = bek
  1862  			mb.bek.XORKeyStream(buf, buf)
  1863  			copy(lchk[0:], buf[len(buf)-checksumSize:])
  1864  		}
  1865  	} else {
  1866  		f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize)
  1867  	}
  1868  	return lchk[:]
  1869  }
  1870  
  1871  // This will make sure we clean up old idx and fss files.
  1872  func (fs *fileStore) cleanupOldMeta() {
  1873  	fs.mu.RLock()
  1874  	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
  1875  	fs.mu.RUnlock()
  1876  
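        	// dios acts as a semaphore bounding concurrent disk I/O: take a token before the
        	// filesystem call and return it when done.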
  1877  	<-dios
  1878  	f, err := os.Open(mdir)
  1879  	dios <- struct{}{}
  1880  	if err != nil {
  1881  		return
  1882  	}
  1883  
  1884  	dirs, _ := f.ReadDir(-1)
  1885  	f.Close()
  1886  
  1887  	const (
  1888  		minLen    = 4
  1889  		idxSuffix = ".idx"
  1890  		fssSuffix = ".fss"
  1891  	)
  1892  	for _, fi := range dirs {
  1893  		if name := fi.Name(); strings.HasSuffix(name, idxSuffix) || strings.HasSuffix(name, fssSuffix) {
  1894  			os.Remove(filepath.Join(mdir, name))
  1895  		}
  1896  	}
  1897  }
  1898  
  1899  func (fs *fileStore) recoverMsgs() error {
  1900  	fs.mu.Lock()
  1901  	defer fs.mu.Unlock()
  1902  
  1903  	// Check for any left over purged messages.
  1904  	<-dios
  1905  	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
  1906  	if _, err := os.Stat(pdir); err == nil {
  1907  		os.RemoveAll(pdir)
  1908  	}
  1909  	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
  1910  	f, err := os.Open(mdir)
  1911  	if err != nil {
  1912  		dios <- struct{}{}
  1913  		return errNotReadable
  1914  	}
  1915  	dirs, err := f.ReadDir(-1)
  1916  	f.Close()
  1917  	dios <- struct{}{}
  1918  
  1919  	if err != nil {
  1920  		return errNotReadable
  1921  	}
  1922  
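        	// Collect the numeric indices of the block files so we can recover them in order below.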
  1923  	indices := make(sort.IntSlice, 0, len(dirs))
  1924  	var index int
  1925  	for _, fi := range dirs {
  1926  		if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 {
  1927  			indices = append(indices, index)
  1928  		}
  1929  	}
  1930  	indices.Sort()
  1931  
  1932  	// Recover all of the msg blocks.
  1933  	// We now guarantee they are coming in order.
  1934  	for _, index := range indices {
  1935  		if mb, err := fs.recoverMsgBlock(uint32(index)); err == nil && mb != nil {
  1936  			// This is a truncated block, possibly with no index. This is possible if the OS
  1937  			// got shut down out from underneath us.
  1938  			if mb.first.seq == 0 {
  1939  				mb.dirtyCloseWithRemove(true)
  1940  				fs.removeMsgBlockFromList(mb)
  1941  				continue
  1942  			}
  1943  			if fseq := atomic.LoadUint64(&mb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq {
  1944  				fs.state.FirstSeq = fseq
  1945  				if mb.first.ts == 0 {
  1946  					fs.state.FirstTime = time.Time{}
  1947  				} else {
  1948  					fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
  1949  				}
  1950  			}
  1951  			if lseq := atomic.LoadUint64(&mb.last.seq); lseq > fs.state.LastSeq {
  1952  				fs.state.LastSeq = lseq
  1953  				if mb.last.ts == 0 {
  1954  					fs.state.LastTime = time.Time{}
  1955  				} else {
  1956  					fs.state.LastTime = time.Unix(0, mb.last.ts).UTC()
  1957  				}
  1958  			}
  1959  			fs.state.Msgs += mb.msgs
  1960  			fs.state.Bytes += mb.bytes
  1961  		} else {
  1962  			return err
  1963  		}
  1964  	}
  1965  
  1966  	if len(fs.blks) > 0 {
  1967  		fs.lmb = fs.blks[len(fs.blks)-1]
  1968  	} else {
  1969  		_, err = fs.newMsgBlockForWrite()
  1970  	}
  1971  
  1972  	// Check if we encountered any lost data.
  1973  	if fs.ld != nil {
  1974  		var emptyBlks []*msgBlock
  1975  		for _, mb := range fs.blks {
  1976  			if mb.msgs == 0 && mb.rbytes == 0 {
  1977  				emptyBlks = append(emptyBlks, mb)
  1978  			}
  1979  		}
  1980  		for _, mb := range emptyBlks {
  1981  			// Need the mb lock here.
  1982  			mb.mu.Lock()
  1983  			fs.removeMsgBlock(mb)
  1984  			mb.mu.Unlock()
  1985  		}
  1986  	}
  1987  
  1988  	if err != nil {
  1989  		return err
  1990  	}
  1991  
  1992  	// Check for keyfiles orphans.
  1993  	if kms, err := filepath.Glob(filepath.Join(mdir, keyScanAll)); err == nil && len(kms) > 0 {
  1994  		valid := make(map[uint32]bool)
  1995  		for _, mb := range fs.blks {
  1996  			valid[mb.index] = true
  1997  		}
  1998  		for _, fn := range kms {
  1999  			var index uint32
  2000  			shouldRemove := true
  2001  			if n, err := fmt.Sscanf(filepath.Base(fn), keyScan, &index); err == nil && n == 1 && valid[index] {
  2002  				shouldRemove = false
  2003  			}
  2004  			if shouldRemove {
  2005  				os.Remove(fn)
  2006  			}
  2007  		}
  2008  	}
  2009  
  2010  	return nil
  2011  }
  2012  
  2013  // Will expire msgs that have aged out on restart.
  2014  // We will treat this differently in case we have a recovery
  2015  // that will expire a lot of messages on startup.
  2016  // Should only be called on startup.
  2017  func (fs *fileStore) expireMsgsOnRecover() {
  2018  	if fs.state.Msgs == 0 {
  2019  		return
  2020  	}
  2021  
  2022  	var minAge = time.Now().UnixNano() - int64(fs.cfg.MaxAge)
  2023  	var purged, bytes uint64
  2024  	var deleted int
  2025  	var nts int64
  2026  
  2027  	// If we expire everything make sure to write out a tombstone. This needs to be done by hand here,
  2028  	// since it is usually taken care of by fs.removeMsgBlock(), but we do not call that here.
  2029  	var last msgId
  2030  
  2031  	deleteEmptyBlock := func(mb *msgBlock) {
  2032  		// If we are the last block, keep state to remember the first/last sequence.
  2033  		// Do this part by hand since not deleting one by one.
  2034  		if mb == fs.lmb {
  2035  			last.seq = atomic.LoadUint64(&mb.last.seq)
  2036  			last.ts = mb.last.ts
  2037  		}
  2038  		// Make sure we do subject cleanup as well.
  2039  		mb.ensurePerSubjectInfoLoaded()
  2040  		for subj, ss := range mb.fss {
  2041  			for i := uint64(0); i < ss.Msgs; i++ {
  2042  				fs.removePerSubject(subj)
  2043  			}
  2044  		}
  2045  		mb.dirtyCloseWithRemove(true)
  2046  		deleted++
  2047  	}
  2048  
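        	// Blocks are ordered, so we can stop at the first block whose first message is newer
        	// than the age limit. At most one block (the last one visited) needs a per-message scan.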
  2049  	for _, mb := range fs.blks {
  2050  		mb.mu.Lock()
  2051  		if minAge < mb.first.ts {
  2052  			nts = mb.first.ts
  2053  			mb.mu.Unlock()
  2054  			break
  2055  		}
  2056  		// Can we remove whole block here?
  2057  		if mb.last.ts <= minAge {
  2058  			purged += mb.msgs
  2059  			bytes += mb.bytes
  2060  			deleteEmptyBlock(mb)
  2061  			mb.mu.Unlock()
  2062  			continue
  2063  		}
  2064  
  2065  		// If we are here we have to process the interior messages of this blk.
  2066  		// This will load fss as well.
  2067  		if err := mb.loadMsgsWithLock(); err != nil {
  2068  			mb.mu.Unlock()
  2069  			break
  2070  		}
  2071  
  2072  		var smv StoreMsg
  2073  		var needNextFirst bool
  2074  
  2075  		// Walk messages and remove if expired.
  2076  		fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
  2077  		for seq := fseq; seq <= lseq; seq++ {
  2078  			sm, err := mb.cacheLookup(seq, &smv)
  2079  			// Process interior deleted msgs.
  2080  			if err == errDeletedMsg {
  2081  				// Update dmap.
  2082  				if mb.dmap.Exists(seq) {
  2083  					mb.dmap.Delete(seq)
  2084  				}
  2085  				// Keep this updated just in case since we are removing dmap entries.
  2086  				atomic.StoreUint64(&mb.first.seq, seq)
  2087  				needNextFirst = true
  2088  				continue
  2089  			}
  2090  			// Break on other errors.
  2091  			if err != nil || sm == nil {
  2092  				atomic.StoreUint64(&mb.first.seq, seq)
  2093  				needNextFirst = true
  2094  				break
  2095  			}
  2096  
  2097  			// No error and sm != nil from here onward.
  2098  
  2099  			// Check for done.
  2100  			if minAge < sm.ts {
  2101  				atomic.StoreUint64(&mb.first.seq, sm.seq)
  2102  				mb.first.ts = sm.ts
  2103  				needNextFirst = false
  2104  				nts = sm.ts
  2105  				break
  2106  			}
  2107  
  2108  			// Delete the message here.
  2109  			if mb.msgs > 0 {
  2110  				atomic.StoreUint64(&mb.first.seq, seq)
  2111  				needNextFirst = true
  2112  				sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
  2113  				if sz > mb.bytes {
  2114  					sz = mb.bytes
  2115  				}
  2116  				mb.bytes -= sz
  2117  				bytes += sz
  2118  				mb.msgs--
  2119  				purged++
  2120  			}
  2121  			// Update fss
  2122  			// Make sure we have fss loaded.
  2123  			mb.removeSeqPerSubject(sm.subj, seq)
  2124  			fs.removePerSubject(sm.subj)
  2125  		}
  2126  		// Make sure we have a proper next first sequence.
  2127  		if needNextFirst {
  2128  			mb.selectNextFirst()
  2129  		}
  2130  		// Check if empty after processing, could happen if tail of messages are all deleted.
  2131  		if mb.msgs == 0 {
  2132  			deleteEmptyBlock(mb)
  2133  		}
  2134  		mb.mu.Unlock()
  2135  		break
  2136  	}
  2137  
  2138  	if nts > 0 {
  2139  		// Make sure to set age check based on this value.
  2140  		fs.resetAgeChk(nts - minAge)
  2141  	}
  2142  
  2143  	if deleted > 0 {
  2144  		// Update block map.
  2145  		if fs.bim != nil {
  2146  			for _, mb := range fs.blks[:deleted] {
  2147  				delete(fs.bim, mb.index)
  2148  			}
  2149  		}
  2150  		// Update blks slice.
  2151  		fs.blks = copyMsgBlocks(fs.blks[deleted:])
  2152  		if lb := len(fs.blks); lb == 0 {
  2153  			fs.lmb = nil
  2154  		} else {
  2155  			fs.lmb = fs.blks[lb-1]
  2156  		}
  2157  	}
  2158  	// Update top level accounting.
  2159  	if purged < fs.state.Msgs {
  2160  		fs.state.Msgs -= purged
  2161  	} else {
  2162  		fs.state.Msgs = 0
  2163  	}
  2164  	if bytes < fs.state.Bytes {
  2165  		fs.state.Bytes -= bytes
  2166  	} else {
  2167  		fs.state.Bytes = 0
  2168  	}
  2169  	// Make sure we properly set the fs first sequence and timestamp.
  2170  	fs.selectNextFirst()
  2171  
  2172  	// Check if we have no messages and blocks left.
  2173  	if fs.lmb == nil && last.seq != 0 {
  2174  		if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil {
  2175  			lmb.writeTombstone(last.seq, last.ts)
  2176  		}
  2177  		// Clear any global subject state.
  2178  		fs.psim, fs.tsl = fs.psim.Empty(), 0
  2179  	}
  2180  
  2181  	// If we purged anything, make sure we kick flush state loop.
  2182  	if purged > 0 {
  2183  		fs.dirty++
  2184  		fs.kickFlushStateLoop()
  2185  	}
  2186  }
  2187  
  2188  func copyMsgBlocks(src []*msgBlock) []*msgBlock {
  2189  	if src == nil {
  2190  		return nil
  2191  	}
  2192  	dst := make([]*msgBlock, len(src))
  2193  	copy(dst, src)
  2194  	return dst
  2195  }
  2196  
  2197  // GetSeqFromTime looks for the first sequence number whose message
  2198  // has a timestamp >= the provided time.
  2199  // FIXME(dlc) - inefficient, and dumb really. Make this better.
  2200  func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 {
  2201  	fs.mu.RLock()
  2202  	lastSeq := fs.state.LastSeq
  2203  	closed := fs.closed
  2204  	fs.mu.RUnlock()
  2205  
  2206  	if closed {
  2207  		return 0
  2208  	}
  2209  
  2210  	mb := fs.selectMsgBlockForStart(t)
  2211  	if mb == nil {
  2212  		return lastSeq + 1
  2213  	}
  2214  
  2215  	fseq := atomic.LoadUint64(&mb.first.seq)
  2216  	lseq := atomic.LoadUint64(&mb.last.seq)
  2217  
  2218  	var smv StoreMsg
  2219  
  2220  	// Linear search, hence the dumb part.
  2221  	ts := t.UnixNano()
  2222  	for seq := fseq; seq <= lseq; seq++ {
  2223  		sm, _, _ := mb.fetchMsg(seq, &smv)
  2224  		if sm != nil && sm.ts >= ts {
  2225  			return sm.seq
  2226  		}
  2227  	}
  2228  	return 0
  2229  }
  2230  
  2231  // Find the first matching message.
  2232  func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
  2233  	mb.mu.Lock()
  2234  	defer mb.mu.Unlock()
  2235  
  2236  	fseq, isAll, subs := start, filter == _EMPTY_ || filter == fwcs, []string{filter}
  2237  
  2238  	var didLoad bool
  2239  	if mb.fssNotLoaded() {
  2240  		// Make sure we have fss loaded.
  2241  		mb.loadMsgsWithLock()
  2242  		didLoad = true
  2243  	}
  2244  
  2245  	// If we only have 1 subject currently and it matches our filter we can also set isAll.
  2246  	if !isAll && len(mb.fss) == 1 {
  2247  		_, isAll = mb.fss[filter]
  2248  	}
  2249  	// Make sure to start at mb.first.seq if fseq < mb.first.seq
  2250  	if seq := atomic.LoadUint64(&mb.first.seq); seq > fseq {
  2251  		fseq = seq
  2252  	}
  2253  	lseq := atomic.LoadUint64(&mb.last.seq)
  2254  
  2255  	// Optionally build the isMatch for wildcard filters.
  2256  	tsa := [32]string{}
  2257  	fsa := [32]string{}
  2258  	var fts []string
  2259  	var isMatch func(subj string) bool
  2260  	// Decide to build.
  2261  	if wc {
  2262  		fts = tokenizeSubjectIntoSlice(fsa[:0], filter)
  2263  		isMatch = func(subj string) bool {
  2264  			tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
  2265  			return isSubsetMatchTokenized(tts, fts)
  2266  		}
  2267  	}
  2268  	// Only do linear scan if isAll or we are wildcarded and have to traverse more fss than actual messages.
  2269  	doLinearScan := isAll || (wc && len(mb.fss) > int(lseq-fseq))
  2270  	if !doLinearScan {
  2271  		// If we have a wildcard match against all tracked subjects we know about.
  2272  		if wc {
  2273  			subs = subs[:0]
  2274  			for subj := range mb.fss {
  2275  				if isMatch(subj) {
  2276  					subs = append(subs, subj)
  2277  				}
  2278  			}
  2279  		}
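        		// Start fseq past the end of the block and pull it back to the earliest first
        		// sequence among the matching subjects (but never earlier than the requested start).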
  2280  		fseq = lseq + 1
  2281  		for _, subj := range subs {
  2282  			ss := mb.fss[subj]
  2283  			if ss != nil && ss.firstNeedsUpdate {
  2284  				mb.recalculateFirstForSubj(subj, ss.First, ss)
  2285  			}
  2286  			if ss == nil || start > ss.Last || ss.First >= fseq {
  2287  				continue
  2288  			}
  2289  			if ss.First < start {
  2290  				fseq = start
  2291  			} else {
  2292  				fseq = ss.First
  2293  			}
  2294  		}
  2295  	}
  2296  
  2297  	// If we guess to not do a linear scan, but the above resulted in a lot of subs that will
  2298  	// need to be checked for every scanned message, revert.
  2299  	// TODO(dlc) - we could memoize the subs across calls.
  2300  	if len(subs) > int(lseq-fseq) {
  2301  		doLinearScan = true
  2302  	}
  2303  
  2304  	if fseq > lseq {
  2305  		return nil, didLoad, ErrStoreMsgNotFound
  2306  	}
  2307  
  2308  	// Need messages loaded from here on out.
  2309  	if mb.cacheNotLoaded() {
  2310  		if err := mb.loadMsgsWithLock(); err != nil {
  2311  			return nil, false, err
  2312  		}
  2313  		didLoad = true
  2314  	}
  2315  
  2316  	if sm == nil {
  2317  		sm = new(StoreMsg)
  2318  	}
  2319  
  2320  	for seq := fseq; seq <= lseq; seq++ {
  2321  		llseq := mb.llseq
  2322  		fsm, err := mb.cacheLookup(seq, sm)
  2323  		if err != nil {
  2324  			continue
  2325  		}
  2326  		expireOk := seq == lseq && mb.llseq == seq
  2327  		if isAll {
  2328  			return fsm, expireOk, nil
  2329  		}
  2330  		if doLinearScan {
  2331  			if wc && isMatch(sm.subj) {
  2332  				return fsm, expireOk, nil
  2333  			} else if !wc && fsm.subj == filter {
  2334  				return fsm, expireOk, nil
  2335  			}
  2336  		} else {
  2337  			for _, subj := range subs {
  2338  				if fsm.subj == subj {
  2339  					return fsm, expireOk, nil
  2340  				}
  2341  			}
  2342  		}
  2343  		// If we are here we did not match, so put the llseq back.
  2344  		mb.llseq = llseq
  2345  	}
  2346  
  2347  	return nil, didLoad, ErrStoreMsgNotFound
  2348  }
  2349  
  2350  // This will traverse a message block and generate the filtered pending.
  2351  func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, first, last uint64) {
  2352  	mb.mu.Lock()
  2353  	defer mb.mu.Unlock()
  2354  	return mb.filteredPendingLocked(subj, wc, seq)
  2355  }
  2356  
  2357  // This will traverse a message block and generate the filtered pending.
  2358  // Lock should be held.
  2359  func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (total, first, last uint64) {
  2360  	isAll := filter == _EMPTY_ || filter == fwcs
  2361  
  2362  	// First check if we can optimize this part.
  2363  	// This means we want all and the starting sequence was before this block.
  2364  	if isAll {
  2365  		if fseq := atomic.LoadUint64(&mb.first.seq); sseq <= fseq {
  2366  			return mb.msgs, fseq, atomic.LoadUint64(&mb.last.seq)
  2367  		}
  2368  	}
  2369  
  2370  	update := func(ss *SimpleState) {
  2371  		total += ss.Msgs
  2372  		if first == 0 || ss.First < first {
  2373  			first = ss.First
  2374  		}
  2375  		if ss.Last > last {
  2376  			last = ss.Last
  2377  		}
  2378  	}
  2379  
  2380  	// Make sure we have fss loaded.
  2381  	mb.ensurePerSubjectInfoLoaded()
  2382  
  2383  	tsa := [32]string{}
  2384  	fsa := [32]string{}
  2385  	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
  2386  
  2387  	// 1. See if we match any subs from fss.
  2388  	// 2. If we match and the sseq is past ss.Last then we can use meta only.
  2389  	// 3. If we match and we need to do a partial, break and clear any totals and do a full scan like num pending.
  2390  
  2391  	isMatch := func(subj string) bool {
  2392  		if !wc {
  2393  			return subj == filter
  2394  		}
  2395  		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
  2396  		return isSubsetMatchTokenized(tts, fts)
  2397  	}
  2398  
  2399  	var havePartial bool
  2400  	for subj, ss := range mb.fss {
  2401  		if isAll || isMatch(subj) {
  2402  			if ss.firstNeedsUpdate {
  2403  				mb.recalculateFirstForSubj(subj, ss.First, ss)
  2404  			}
  2405  			if sseq <= ss.First {
  2406  				update(ss)
  2407  			} else if sseq <= ss.Last {
  2408  				// We matched but it's a partial.
  2409  				havePartial = true
  2410  				break
  2411  			}
  2412  		}
  2413  	}
  2414  
  2415  	// If we did not encounter any partials we can return here.
  2416  	if !havePartial {
  2417  		return total, first, last
  2418  	}
  2419  
  2420  	// If we are here we need to scan the msgs.
  2421  	// Clear what we had.
  2422  	total, first, last = 0, 0, 0
  2423  
  2424  	// If we load the cache for a linear scan we want to expire that cache upon exit.
  2425  	var shouldExpire bool
  2426  	if mb.cacheNotLoaded() {
  2427  		mb.loadMsgsWithLock()
  2428  		shouldExpire = true
  2429  	}
  2430  
  2431  	var smv StoreMsg
  2432  	for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ {
  2433  		sm, _ := mb.cacheLookup(seq, &smv)
  2434  		if sm == nil {
  2435  			continue
  2436  		}
  2437  		if isAll || isMatch(sm.subj) {
  2438  			total++
  2439  			if first == 0 || seq < first {
  2440  				first = seq
  2441  			}
  2442  			if seq > last {
  2443  				last = seq
  2444  			}
  2445  		}
  2446  	}
  2447  	// If we loaded this block for this operation go ahead and expire it here.
  2448  	if shouldExpire {
  2449  		mb.tryForceExpireCacheLocked()
  2450  	}
  2451  
  2452  	return total, first, last
  2453  }
  2454  
  2455  // FilteredState will return the SimpleState associated with the filtered subject and a proposed starting sequence.
  2456  func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState {
  2457  	fs.mu.RLock()
  2458  	defer fs.mu.RUnlock()
  2459  
  2460  	lseq := fs.state.LastSeq
  2461  	if sseq < fs.state.FirstSeq {
  2462  		sseq = fs.state.FirstSeq
  2463  	}
  2464  
  2465  	// Returned state.
  2466  	var ss SimpleState
  2467  
  2468  	// If past the end no results.
  2469  	if sseq > lseq {
  2470  		// Make sure we track sequences
  2471  		ss.First = fs.state.FirstSeq
  2472  		ss.Last = fs.state.LastSeq
  2473  		return ss
  2474  	}
  2475  
  2476  	// If we want all msgs that match we can shortcircuit.
  2477  	// TODO(dlc) - This can be extended for all cases but would
  2478  	// need to be careful on total msgs calculations etc.
  2479  	if sseq == fs.state.FirstSeq {
  2480  		fs.numFilteredPending(subj, &ss)
  2481  	} else {
  2482  		wc := subjectHasWildcard(subj)
  2483  		// Tracking subject state.
  2484  		// TODO(dlc) - Optimize for 2.10 with avl tree and no atomics per block.
  2485  		for _, mb := range fs.blks {
  2486  			// Skip blocks that are less than our starting sequence.
  2487  			if sseq > atomic.LoadUint64(&mb.last.seq) {
  2488  				continue
  2489  			}
  2490  			t, f, l := mb.filteredPending(subj, wc, sseq)
  2491  			ss.Msgs += t
  2492  			if ss.First == 0 || (f > 0 && f < ss.First) {
  2493  				ss.First = f
  2494  			}
  2495  			if l > ss.Last {
  2496  				ss.Last = l
  2497  			}
  2498  		}
  2499  	}
  2500  
  2501  	return ss
  2502  }
  2503  
  2504  // Optimized way for getting all num pending matching a filter subject.
  2505  // Lock should be held.
  2506  func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) {
  2507  	isAll := filter == _EMPTY_ || filter == fwcs
  2508  
  2509  	// If isAll we do not need to do anything special to calculate the first and last and total.
  2510  	if isAll {
  2511  		ss.First = fs.state.FirstSeq
  2512  		ss.Last = fs.state.LastSeq
  2513  		ss.Msgs = fs.state.Msgs
  2514  		return
  2515  	}
  2516  
  2517  	start, stop := uint32(math.MaxUint32), uint32(0)
  2518  	fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) {
  2519  		ss.Msgs += psi.total
  2520  		// Keep track of start and stop indexes for this subject.
  2521  		if psi.fblk < start {
  2522  			start = psi.fblk
  2523  		}
  2524  		if psi.lblk > stop {
  2525  			stop = psi.lblk
  2526  		}
  2527  	})
  2528  	// We do need to figure out the first and last sequences.
  2529  	wc := subjectHasWildcard(filter)
  2530  	// Do start
  2531  	mb := fs.bim[start]
  2532  	if mb != nil {
  2533  		_, f, _ := mb.filteredPending(filter, wc, 0)
  2534  		ss.First = f
  2535  	}
  2536  	if ss.First == 0 {
  2537  		// This is a miss. This can happen since psi.fblk is lazy, but should be very rare.
  2538  		for i := start + 1; i <= stop; i++ {
  2539  			mb := fs.bim[i]
  2540  			if mb == nil {
  2541  				continue
  2542  			}
  2543  			if _, f, _ := mb.filteredPending(filter, wc, 0); f > 0 {
  2544  				ss.First = f
  2545  				break
  2546  			}
  2547  		}
  2548  	}
  2549  	// Now last
  2550  	if mb = fs.bim[stop]; mb != nil {
  2551  		_, _, l := mb.filteredPending(filter, wc, 0)
  2552  		ss.Last = l
  2553  	}
  2554  }
  2555  
  2556  // SubjectsState returns a map of SimpleState for all matching subjects.
  2557  func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState {
  2558  	fs.mu.RLock()
  2559  	defer fs.mu.RUnlock()
  2560  
  2561  	if fs.state.Msgs == 0 || fs.noTrackSubjects() {
  2562  		return nil
  2563  	}
  2564  
  2565  	start, stop := fs.blks[0], fs.lmb
  2566  	// We can short circuit if not a wildcard using psim for start and stop.
  2567  	if !subjectHasWildcard(subject) {
  2568  		info, ok := fs.psim.Find(stringToBytes(subject))
  2569  		if !ok {
  2570  			return nil
  2571  		}
  2572  		start, stop = fs.bim[info.fblk], fs.bim[info.lblk]
  2573  	}
  2574  
  2575  	// Aggregate fss.
  2576  	fss := make(map[string]SimpleState)
  2577  	var startFound bool
  2578  
  2579  	for _, mb := range fs.blks {
  2580  		if !startFound {
  2581  			if mb != start {
  2582  				continue
  2583  			}
  2584  			startFound = true
  2585  		}
  2586  
  2587  		mb.mu.Lock()
  2588  		var shouldExpire bool
  2589  		if mb.fssNotLoaded() {
  2590  			// Make sure we have fss loaded.
  2591  			mb.loadMsgsWithLock()
  2592  			shouldExpire = true
  2593  		}
  2594  		for subj, ss := range mb.fss {
  2595  			if subject == _EMPTY_ || subject == fwcs || subjectIsSubsetMatch(subj, subject) {
  2596  				if ss.firstNeedsUpdate {
  2597  					mb.recalculateFirstForSubj(subj, ss.First, ss)
  2598  				}
  2599  				oss := fss[subj]
  2600  				if oss.First == 0 { // New
  2601  					fss[subj] = *ss
  2602  				} else {
  2603  					// Merge here.
  2604  					oss.Last, oss.Msgs = ss.Last, oss.Msgs+ss.Msgs
  2605  					fss[subj] = oss
  2606  				}
  2607  			}
  2608  		}
  2609  		if shouldExpire {
  2610  			// Expire this cache before moving on.
  2611  			mb.tryForceExpireCacheLocked()
  2612  		}
  2613  		mb.mu.Unlock()
  2614  
  2615  		if mb == stop {
  2616  			break
  2617  		}
  2618  	}
  2619  
  2620  	return fss
  2621  }
  2622  
  2623  // MultiLastSeqs will return a sorted list of sequences that match all subjects presented in filters.
  2624  // We will not exceed the maxSeq, which if 0 becomes the store's last sequence.
  2625  func (fs *fileStore) MultiLastSeqs(filters []string, maxSeq uint64, maxAllowed int) ([]uint64, error) {
  2626  	fs.mu.RLock()
  2627  	defer fs.mu.RUnlock()
  2628  
  2629  	if fs.state.Msgs == 0 || fs.noTrackSubjects() {
  2630  		return nil, nil
  2631  	}
  2632  
  2633  	lastBlkIndex := len(fs.blks) - 1
  2634  	lastMB := fs.blks[lastBlkIndex]
  2635  
  2636  	// Implied last sequence.
  2637  	if maxSeq == 0 {
  2638  		maxSeq = fs.state.LastSeq
  2639  	} else {
  2640  		// Update last mb index if maxSeq is not the last sequence.
  2641  		lastBlkIndex, lastMB = fs.selectMsgBlockWithIndex(maxSeq)
  2642  	}
  2643  	// Make sure it is non-nil.
  2644  	if lastMB == nil {
  2645  		return nil, nil
  2646  	}
  2647  
  2648  	// Grab our last mb index (not same as blk index).
  2649  	lastMB.mu.RLock()
  2650  	lastMBIndex := lastMB.index
  2651  	lastMB.mu.RUnlock()
  2652  
  2653  	subs := make(map[string]*psi)
  2654  	ltSeen := make(map[string]uint32)
  2655  	for _, filter := range filters {
  2656  		fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) {
  2657  			s := string(subj)
  2658  			subs[s] = psi
  2659  			if psi.lblk < lastMBIndex {
  2660  				ltSeen[s] = psi.lblk
  2661  			}
  2662  		})
  2663  	}
  2664  
  2665  	// If all subjects have a lower last index, select the largest for our walk backwards.
  2666  	if len(ltSeen) == len(subs) {
  2667  		max := uint32(0)
  2668  		for _, mbi := range ltSeen {
  2669  			if mbi > max {
  2670  				max = mbi
  2671  			}
  2672  		}
  2673  		lastMB = fs.bim[max]
  2674  	}
  2675  
  2676  	// Collect all sequences needed.
  2677  	seqs := make([]uint64, 0, len(subs))
  2678  	for i, lnf := lastBlkIndex, false; i >= 0; i-- {
  2679  		if len(subs) == 0 {
  2680  			break
  2681  		}
  2682  		mb := fs.blks[i]
  2683  		if !lnf {
  2684  			if mb != lastMB {
  2685  				continue
  2686  			}
  2687  			lnf = true
  2688  		}
  2689  		// We can start properly looking here.
  2690  		mb.mu.Lock()
  2691  		mb.ensurePerSubjectInfoLoaded()
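        		// For each remaining subject: if its last sequence in this block is within maxSeq we
        		// can take it straight from the fss metadata, otherwise scan the block backwards from
        		// maxSeq to find the newest qualifying message.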
  2692  		for subj, psi := range subs {
  2693  			if ss := mb.fss[subj]; ss != nil {
  2694  				if ss.Last <= maxSeq {
  2695  					seqs = append(seqs, ss.Last)
  2696  					delete(subs, subj)
  2697  				} else {
  2698  					// Need to search for it since last is > maxSeq.
  2699  					if mb.cacheNotLoaded() {
  2700  						mb.loadMsgsWithLock()
  2701  					}
  2702  					var smv StoreMsg
  2703  					fseq := atomic.LoadUint64(&mb.first.seq)
  2704  					for seq := maxSeq; seq >= fseq; seq-- {
  2705  						sm, _ := mb.cacheLookup(seq, &smv)
  2706  						if sm == nil || sm.subj != subj {
  2707  							continue
  2708  						}
  2709  						seqs = append(seqs, sm.seq)
  2710  						delete(subs, subj)
  2711  						break
  2712  					}
  2713  				}
  2714  			} else if mb.index <= psi.fblk {
  2715  				// Track which subs are no longer applicable, meaning we will not find a valid msg at this point.
  2716  				delete(subs, subj)
  2717  			}
  2718  			// TODO(dlc) we could track lblk like above in case some subs are very far apart.
  2719  			// Not too bad if fss loaded since we will skip over quickly with it loaded, but might be worth it.
  2720  		}
  2721  		mb.mu.Unlock()
  2722  
  2723  		// If maxAllowed was specified, check that we will not exceed it.
  2724  		if maxAllowed > 0 && len(seqs) > maxAllowed {
  2725  			return nil, ErrTooManyResults
  2726  		}
  2727  
  2728  	}
  2729  	if len(seqs) == 0 {
  2730  		return nil, nil
  2731  	}
  2732  	sort.Slice(seqs, func(i, j int) bool { return seqs[i] < seqs[j] })
  2733  	return seqs, nil
  2734  }
  2735  
  2736  // NumPending will return the number of pending messages matching the filter subject starting at sequence.
  2737  // Optimized for stream num pending calculations for consumers.
  2738  func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) (total, validThrough uint64) {
  2739  	fs.mu.RLock()
  2740  	defer fs.mu.RUnlock()
  2741  
  2742  	// This can always be last for these purposes.
  2743  	validThrough = fs.state.LastSeq
  2744  
  2745  	if fs.state.Msgs == 0 || sseq > fs.state.LastSeq {
  2746  		return 0, validThrough
  2747  	}
  2748  
  2749  	// Track the starting block for the sseq and the starting block that matches any subject.
  2750  	var seqStart int
  2751  	// See if we need to figure out starting block per sseq.
  2752  	if sseq > fs.state.FirstSeq {
  2753  		// This should not happen, but it can return -1, so make sure we check to avoid a panic below.
  2754  		if seqStart, _ = fs.selectMsgBlockWithIndex(sseq); seqStart < 0 {
  2755  			seqStart = 0
  2756  		}
  2757  	}
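        	// Note that seqStart above is an index into fs.blks, not a sequence number.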
  2758  
  2759  	isAll := filter == _EMPTY_ || filter == fwcs
  2760  	wc := subjectHasWildcard(filter)
  2761  
  2762  	// See if a filter was provided but it is the only subject.
  2763  	if !isAll && !wc && fs.psim.Size() == 1 {
  2764  		if _, ok := fs.psim.Find(stringToBytes(filter)); ok {
  2765  			isAll = true
  2766  		}
  2767  	}
  2768  	if isAll && filter == _EMPTY_ {
  2769  		filter = fwcs
  2770  	}
  2771  	// If we are isAll and have no deleted we can do a simpler calculation.
  2772  	if !lastPerSubject && isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs {
  2773  		if sseq == 0 {
  2774  			return fs.state.Msgs, validThrough
  2775  		}
  2776  		return fs.state.LastSeq - sseq + 1, validThrough
  2777  	}
  2778  
  2779  	var tsa, fsa [32]string
  2780  	fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
  2781  
  2782  	isMatch := func(subj string) bool {
  2783  		if isAll {
  2784  			return true
  2785  		}
  2786  		if !wc {
  2787  			return subj == filter
  2788  		}
  2789  		tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
  2790  		return isSubsetMatchTokenized(tts, fts)
  2791  	}
  2792  
  2793  	// Handle last by subject a bit differently.
  2794  	// We will scan PSIM since we accurately track the last block we have seen the subject in. This
  2795  	// allows us to only need to load at most one block now.
  2796  	// For the last block, we need to track the subjects that we know are in that block, and track seen
  2797  	// while in the block itself, but the complexity there is worth it.
  2798  	if lastPerSubject {
  2799  		// If we want all and our start sequence is equal or less than first return number of subjects.
  2800  		if isAll && sseq <= fs.state.FirstSeq {
  2801  			return uint64(fs.psim.Size()), validThrough
  2802  		}
  2803  		// If we are here we need to scan. We are going to scan the PSIM looking for entries whose lblk is >= the starting block's index.
  2804  		// This will build up a list of all subjects from the selected block onward.
  2805  		lbm := make(map[string]bool)
  2806  		mb := fs.blks[seqStart]
  2807  		bi := mb.index
  2808  
  2809  		fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) {
  2810  			// If the select blk start is greater than entry's last blk skip.
  2811  			if bi > psi.lblk {
  2812  				return
  2813  			}
  2814  			total++
  2815  			// We will track the subjects that are an exact match to the last block.
  2816  			// This is needed for last block processing.
  2817  			if psi.lblk == bi {
  2818  				lbm[string(subj)] = true
  2819  			}
  2820  		})
  2821  
  2822  		// Now check if we need to inspect the seqStart block.
  2823  		// Grab write lock in case we need to load in msgs.
  2824  		mb.mu.Lock()
  2825  		var shouldExpire bool
  2826  		// We need to walk this block to correct accounting from above.
  2827  		if sseq > mb.first.seq {
  2828  			// Track the ones we add back in case more than one.
  2829  			seen := make(map[string]bool)
  2830  			// We need to discount the total by subjects seen before sseq, but also add them right back in if they are >= sseq for this blk.
  2831  			// These should only be subjects whose last blk we know is this block.
  2832  			if mb.cacheNotLoaded() {
  2833  				mb.loadMsgsWithLock()
  2834  				shouldExpire = true
  2835  			}
  2836  			var smv StoreMsg
  2837  			for seq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ {
  2838  				sm, _ := mb.cacheLookup(seq, &smv)
  2839  				if sm == nil || sm.subj == _EMPTY_ || !lbm[sm.subj] {
  2840  					continue
  2841  				}
  2842  				if isMatch(sm.subj) {
  2843  					// If less than sseq adjust off of total as long as this subject matched the last block.
  2844  					if seq < sseq {
  2845  						if !seen[sm.subj] {
  2846  							total--
  2847  							seen[sm.subj] = true
  2848  						}
  2849  					} else if seen[sm.subj] {
  2850  						// This is equal or more than sseq, so add back in.
  2851  						total++
  2852  						// Make sure to not process it again.
  2853  						delete(seen, sm.subj)
  2854  					}
  2855  				}
  2856  			}
  2857  		}
  2858  		// If we loaded the block try to force expire.
  2859  		if shouldExpire {
  2860  			mb.tryForceExpireCacheLocked()
  2861  		}
  2862  		mb.mu.Unlock()
  2863  		return total, validThrough
  2864  	}
  2865  
  2866  	// If we would need to scan more from the beginning, revert to calculating directly here.
  2867  	// TODO(dlc) - Redo properly with sublists etc for subject-based filtering.
  2868  	if seqStart >= (len(fs.blks) / 2) {
  2869  		for i := seqStart; i < len(fs.blks); i++ {
  2870  			var shouldExpire bool
  2871  			mb := fs.blks[i]
  2872  			// Hold write lock in case we need to load cache.
  2873  			mb.mu.Lock()
  2874  			var t uint64
  2875  			if isAll && sseq <= atomic.LoadUint64(&mb.first.seq) {
  2876  				total += mb.msgs
  2877  				mb.mu.Unlock()
  2878  				continue
  2879  			}
  2880  			// If we are here we need to at least scan the subject fss.
  2881  			// Make sure we have fss loaded.
  2882  			if mb.fssNotLoaded() {
  2883  				mb.loadMsgsWithLock()
  2884  				shouldExpire = true
  2885  			}
  2886  			var havePartial bool
  2887  			for subj, ss := range mb.fss {
  2888  				if isMatch(subj) {
  2889  					if ss.firstNeedsUpdate {
  2890  						mb.recalculateFirstForSubj(subj, ss.First, ss)
  2891  					}
  2892  					if sseq <= ss.First {
  2893  						t += ss.Msgs
  2894  					} else if sseq <= ss.Last {
  2895  					// We matched but it's a partial.
  2896  						havePartial = true
  2897  						break
  2898  					}
  2899  				}
  2900  			}
  2901  			// See if we need to scan msgs here.
  2902  			if havePartial {
  2903  				// Make sure we have the cache loaded.
  2904  				if mb.cacheNotLoaded() {
  2905  					mb.loadMsgsWithLock()
  2906  					shouldExpire = true
  2907  				}
  2908  				// Clear on partial.
  2909  				t = 0
  2910  				var smv StoreMsg
  2911  				for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ {
  2912  					if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && isMatch(sm.subj) {
  2913  						t++
  2914  					}
  2915  				}
  2916  			}
  2917  			// If we loaded this block for this operation go ahead and expire it here.
  2918  			if shouldExpire {
  2919  				mb.tryForceExpireCacheLocked()
  2920  			}
  2921  			mb.mu.Unlock()
  2922  			total += t
  2923  		}
  2924  		return total, validThrough
  2925  	}
  2926  
  2927  	// If we are here it's better to calculate totals from psim and adjust downward by scanning fewer blocks.
  2928  	// TODO(dlc) - Eventually when sublist uses generics, make this sublist driven instead.
  2929  	start := uint32(math.MaxUint32)
  2930  	fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) {
  2931  		total += psi.total
  2932  		// Keep track of start index for this subject.
  2933  		if psi.fblk < start {
  2934  			start = psi.fblk
  2935  		}
  2936  	})
  2937  	// See if we were asked for all, if so we are done.
  2938  	if sseq <= fs.state.FirstSeq {
  2939  		return total, validThrough
  2940  	}
  2941  
  2942  	// If we are here we need to calculate partials for the first blocks.
  2943  	firstSubjBlk := fs.bim[start]
  2944  	var firstSubjBlkFound bool
  2945  	// Adjust in case not found.
  2946  	if firstSubjBlk == nil {
  2947  		firstSubjBlkFound = true
  2948  	}
  2949  
  2950  	// Track how many we need to adjust against the total.
  2951  	var adjust uint64
  2952  	for i := 0; i <= seqStart; i++ {
  2953  		mb := fs.blks[i]
  2954  		// We can skip blks if we know they are below the first one that has any subject matches.
  2955  		if !firstSubjBlkFound {
  2956  			if firstSubjBlkFound = (mb == firstSubjBlk); !firstSubjBlkFound {
  2957  				continue
  2958  			}
  2959  		}
  2960  		// We need to scan this block.
  2961  		var shouldExpire bool
  2962  		mb.mu.Lock()
  2963  		// Check if we should include all of this block in adjusting. If so work with metadata.
  2964  		if sseq > atomic.LoadUint64(&mb.last.seq) {
  2965  			if isAll {
  2966  				adjust += mb.msgs
  2967  			} else {
  2968  				// We need to adjust for all matches in this block.
  2969  				// Make sure we have fss loaded. This loads whole block now.
  2970  				if mb.fssNotLoaded() {
  2971  					mb.loadMsgsWithLock()
  2972  					shouldExpire = true
  2973  				}
  2974  				for subj, ss := range mb.fss {
  2975  					if isMatch(subj) {
  2976  						adjust += ss.Msgs
  2977  					}
  2978  				}
  2979  			}
  2980  		} else {
  2981  			// This is the last block. We need to scan per message here.
  2982  			if mb.cacheNotLoaded() {
  2983  				mb.loadMsgsWithLock()
  2984  				shouldExpire = true
  2985  			}
  2986  			var last = atomic.LoadUint64(&mb.last.seq)
  2987  			if sseq < last {
  2988  				last = sseq
  2989  			}
  2990  			// We need to walk all messages in this block
  2991  			var smv StoreMsg
  2992  			for seq := atomic.LoadUint64(&mb.first.seq); seq < last; seq++ {
  2993  				sm, _ := mb.cacheLookup(seq, &smv)
  2994  				if sm == nil || sm.subj == _EMPTY_ {
  2995  					continue
  2996  				}
  2997  				// Check if it matches our filter.
  2998  				if sm.seq < sseq && isMatch(sm.subj) {
  2999  					adjust++
  3000  				}
  3001  			}
  3002  		}
  3003  		// If we loaded the block try to force expire.
  3004  		if shouldExpire {
  3005  			mb.tryForceExpireCacheLocked()
  3006  		}
  3007  		mb.mu.Unlock()
  3008  	}
  3009  	// Make final adjustment.
  3010  	total -= adjust
  3011  
  3012  	return total, validThrough
  3013  }
  3014  
  3015  // SubjectsTotals returns message totals per subject.
  3016  func (fs *fileStore) SubjectsTotals(filter string) map[string]uint64 {
  3017  	fs.mu.RLock()
  3018  	defer fs.mu.RUnlock()
  3019  
  3020  	if fs.psim.Size() == 0 {
  3021  		return nil
  3022  	}
  3023  	// Match all if no filter given.
  3024  	if filter == _EMPTY_ {
  3025  		filter = fwcs
  3026  	}
  3027  	fst := make(map[string]uint64)
  3028  	fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) {
  3029  		fst[string(subj)] = psi.total
  3030  	})
  3031  	return fst
  3032  }
  3033  
  3034  // RegisterStorageUpdates registers a callback for updates to storage changes.
  3035  // It will present the number of messages and bytes as signed integers and an
  3036  // optional sequence number of the message if it is a single message.
  3037  func (fs *fileStore) RegisterStorageUpdates(cb StorageUpdateHandler) {
  3038  	fs.mu.Lock()
  3039  	fs.scb = cb
  3040  	bsz := fs.state.Bytes
  3041  	fs.mu.Unlock()
  3042  	if cb != nil && bsz > 0 {
  3043  		cb(0, int64(bsz), 0, _EMPTY_)
  3044  	}
  3045  }
  3046  
  3047  // Helper to get hash key for specific message block.
  3048  // Lock should be held
  3049  func (fs *fileStore) hashKeyForBlock(index uint32) []byte {
  3050  	return []byte(fmt.Sprintf("%s-%d", fs.cfg.Name, index))
  3051  }
  3052  
  3053  func (mb *msgBlock) setupWriteCache(buf []byte) {
  3054  	// Make sure we have a cache setup.
  3055  	if mb.cache != nil {
  3056  		return
  3057  	}
  3058  
  3059  	// Setup simple cache.
  3060  	mb.cache = &cache{buf: buf}
  3061  	// Make sure we set the proper cache offset if we have existing data.
  3062  	var fi os.FileInfo
  3063  	if mb.mfd != nil {
  3064  		fi, _ = mb.mfd.Stat()
  3065  	} else if mb.mfn != _EMPTY_ {
  3066  		fi, _ = os.Stat(mb.mfn)
  3067  	}
  3068  	if fi != nil {
  3069  		mb.cache.off = int(fi.Size())
  3070  	}
  3071  	mb.llts = time.Now().UnixNano()
  3072  	mb.startCacheExpireTimer()
  3073  }
  3074  
  3075  // This rolls to a new append msg block.
  3076  // Lock should be held.
  3077  func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) {
  3078  	index := uint32(1)
  3079  	var rbuf []byte
  3080  
  3081  	if lmb := fs.lmb; lmb != nil {
  3082  		index = lmb.index + 1
  3083  		// Determine if we can reclaim any resources here.
  3084  		if fs.fip {
  3085  			lmb.mu.Lock()
  3086  			lmb.closeFDsLocked()
  3087  			if lmb.cache != nil {
  3088  				// Reset write timestamp and see if we can expire this cache.
  3089  				rbuf = lmb.tryExpireWriteCache()
  3090  			}
  3091  			lmb.mu.Unlock()
  3092  		}
  3093  	}
  3094  
  3095  	mb := fs.initMsgBlock(index)
  3096  	// Lock should be held to quiet race detector.
  3097  	mb.mu.Lock()
  3098  	mb.setupWriteCache(rbuf)
  3099  	mb.fss = make(map[string]*SimpleState)
  3100  
  3101  	// Set cache time to creation time to start.
  3102  	ts := time.Now().UnixNano()
  3103  	mb.llts, mb.lwts = 0, ts
  3104  	// Remember our last sequence number.
  3105  	atomic.StoreUint64(&mb.first.seq, fs.state.LastSeq+1)
  3106  	atomic.StoreUint64(&mb.last.seq, fs.state.LastSeq)
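        	// Setting first to last+1 is the convention for a block that has no messages yet.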
  3107  	mb.mu.Unlock()
  3108  
  3109  	// Now do local hash.
  3110  	key := sha256.Sum256(fs.hashKeyForBlock(index))
  3111  	hh, err := highwayhash.New64(key[:])
  3112  	if err != nil {
  3113  		return nil, fmt.Errorf("could not create hash: %v", err)
  3114  	}
  3115  	mb.hh = hh
  3116  
  3117  	<-dios
  3118  	mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
  3119  	dios <- struct{}{}
  3120  
  3121  	if err != nil {
  3122  		mb.dirtyCloseWithRemove(true)
  3123  		return nil, fmt.Errorf("Error creating msg block file: %v", err)
  3124  	}
  3125  	mb.mfd = mfd
  3126  
  3127  	// Check if encryption is enabled.
  3128  	if fs.prf != nil {
  3129  		if err := fs.genEncryptionKeysForBlock(mb); err != nil {
  3130  			return nil, err
  3131  		}
  3132  	}
  3133  
  3134  	// If we know we will need this, go ahead and spin it up.
  3135  	if !fs.fip {
  3136  		mb.spinUpFlushLoop()
  3137  	}
  3138  
  3139  	// Add to our list of blocks and mark as last.
  3140  	fs.addMsgBlock(mb)
  3141  
  3142  	if fs.dirty > 0 {
  3143  		fs.kickFlushStateLoop()
  3144  	}
  3145  
  3146  	return mb, nil
  3147  }
  3148  
  3149  // Generate the keys for this message block and write them out.
  3150  func (fs *fileStore) genEncryptionKeysForBlock(mb *msgBlock) error {
  3151  	if mb == nil {
  3152  		return nil
  3153  	}
  3154  	key, bek, seed, encrypted, err := fs.genEncryptionKeys(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))
  3155  	if err != nil {
  3156  		return err
  3157  	}
  3158  	mb.aek, mb.bek, mb.seed, mb.nonce = key, bek, seed, encrypted[:key.NonceSize()]
  3159  	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
  3160  	keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))
  3161  	if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
  3162  		return err
  3163  	}
  3164  	<-dios
  3165  	err = os.WriteFile(keyFile, encrypted, defaultFilePerms)
  3166  	dios <- struct{}{}
  3167  	if err != nil {
  3168  		return err
  3169  	}
  3170  	mb.kfn = keyFile
  3171  	return nil
  3172  }
  3173  
  3174  // Stores a raw message with expected sequence number and timestamp.
  3175  // Lock should be held.
  3176  func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) (err error) {
  3177  	if fs.closed {
  3178  		return ErrStoreClosed
  3179  	}
  3180  
  3181  	// Per subject max check needed.
  3182  	mmp := uint64(fs.cfg.MaxMsgsPer)
  3183  	var psmc uint64
  3184  	psmax := mmp > 0 && len(subj) > 0
  3185  	if psmax {
  3186  		if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
  3187  			psmc = info.total
  3188  		}
  3189  	}
  3190  
  3191  	var fseq uint64
  3192  	// Check if we are discarding new messages when we reach the limit.
  3193  	if fs.cfg.Discard == DiscardNew {
  3194  		var asl bool
  3195  		if psmax && psmc >= mmp {
  3196  			// If we are instructed to discard new per subject, this is an error.
  3197  			if fs.cfg.DiscardNewPer {
  3198  				return ErrMaxMsgsPerSubject
  3199  			}
  3200  			if fseq, err = fs.firstSeqForSubj(subj); err != nil {
  3201  				return err
  3202  			}
  3203  			asl = true
  3204  		}
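        		// If asl is set the per-subject limit was hit, so the oldest message for this
        		// subject will be removed to make room and the stream-level checks below are relaxed.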
  3205  		if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl {
  3206  			return ErrMaxMsgs
  3207  		}
  3208  		if fs.cfg.MaxBytes > 0 && fs.state.Bytes+fileStoreMsgSize(subj, hdr, msg) >= uint64(fs.cfg.MaxBytes) {
  3209  			if !asl || fs.sizeForSeq(fseq) <= int(fileStoreMsgSize(subj, hdr, msg)) {
  3210  				return ErrMaxBytes
  3211  			}
  3212  		}
  3213  	}
  3214  
  3215  	// Check sequence.
  3216  	if seq != fs.state.LastSeq+1 {
  3217  		if seq > 0 {
  3218  			return ErrSequenceMismatch
  3219  		}
  3220  		seq = fs.state.LastSeq + 1
  3221  	}
  3222  
  3223  	// Write msg record.
  3224  	n, err := fs.writeMsgRecord(seq, ts, subj, hdr, msg)
  3225  	if err != nil {
  3226  		return err
  3227  	}
  3228  
  3229  	// Adjust top level tracking of per subject msg counts.
  3230  	if len(subj) > 0 && fs.psim != nil {
  3231  		index := fs.lmb.index
  3232  		if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
  3233  			info.total++
  3234  			if index > info.lblk {
  3235  				info.lblk = index
  3236  			}
  3237  		} else {
  3238  			fs.psim.Insert(stringToBytes(subj), psi{total: 1, fblk: index, lblk: index})
  3239  			fs.tsl += len(subj)
  3240  		}
  3241  	}
  3242  
  3243  	// Adjust first if needed.
  3244  	now := time.Unix(0, ts).UTC()
  3245  	if fs.state.Msgs == 0 {
  3246  		fs.state.FirstSeq = seq
  3247  		fs.state.FirstTime = now
  3248  	}
  3249  
  3250  	fs.state.Msgs++
  3251  	fs.state.Bytes += n
  3252  	fs.state.LastSeq = seq
  3253  	fs.state.LastTime = now
  3254  
  3255  	// Enforce per message limits.
  3256  	// We snapshotted psmc before our actual write, so >= comparison needed.
  3257  	if psmax && psmc >= mmp {
  3258  		// We may have done this above.
  3259  		if fseq == 0 {
  3260  			fseq, _ = fs.firstSeqForSubj(subj)
  3261  		}
  3262  		if ok, _ := fs.removeMsgViaLimits(fseq); ok {
  3263  			// Make sure we are below the limit.
  3264  			if psmc--; psmc >= mmp {
  3265  				bsubj := stringToBytes(subj)
  3266  				for info, ok := fs.psim.Find(bsubj); ok && info.total > mmp; info, ok = fs.psim.Find(bsubj) {
  3267  					if seq, _ := fs.firstSeqForSubj(subj); seq > 0 {
  3268  						if ok, _ := fs.removeMsgViaLimits(seq); !ok {
  3269  							break
  3270  						}
  3271  					} else {
  3272  						break
  3273  					}
  3274  				}
  3275  			}
  3276  		} else if mb := fs.selectMsgBlock(fseq); mb != nil {
  3277  			// If we are here we could not remove fseq from above, so rebuild.
  3278  			var ld *LostStreamData
  3279  			if ld, _, _ = mb.rebuildState(); ld != nil {
  3280  				fs.rebuildStateLocked(ld)
  3281  			}
  3282  		}
  3283  	}
  3284  
  3285  	// Limits checks and enforcement.
  3286  	// If they do any deletions they will update the
  3287  	// byte count on their own, so no need to compensate.
  3288  	fs.enforceMsgLimit()
  3289  	fs.enforceBytesLimit()
  3290  
  3291  	// Check if we have and need the age expiration timer running.
  3292  	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
  3293  		fs.startAgeChk()
  3294  	}
  3295  
  3296  	return nil
  3297  }
  3298  
  3299  // StoreRawMsg stores a raw message with expected sequence number and timestamp.
  3300  func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) error {
  3301  	fs.mu.Lock()
  3302  	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
  3303  	cb := fs.scb
  3304  	// Check if the first message timestamp requires expiry
  3305  	// sooner than the initial replica expiry timer, which was set to MaxAge when initializing.
  3306  	if !fs.receivedAny && fs.cfg.MaxAge != 0 && ts > 0 {
  3307  		fs.receivedAny = true
  3308  		// don't block here by calling expireMsgs directly.
  3309  		// Instead, set short timeout.
  3310  		fs.resetAgeChk(int64(time.Millisecond * 50))
  3311  	}
  3312  	fs.mu.Unlock()
  3313  
  3314  	if err == nil && cb != nil {
  3315  		cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
  3316  	}
  3317  
  3318  	return err
  3319  }
  3320  
  3321  // StoreMsg stores a message. We hold the main filestore lock for any write operation.
  3322  func (fs *fileStore) StoreMsg(subj string, hdr, msg []byte) (uint64, int64, error) {
  3323  	fs.mu.Lock()
  3324  	seq, ts := fs.state.LastSeq+1, time.Now().UnixNano()
  3325  	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
  3326  	cb := fs.scb
  3327  	fs.mu.Unlock()
  3328  
  3329  	if err != nil {
  3330  		seq, ts = 0, 0
  3331  	} else if cb != nil {
  3332  		cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
  3333  	}
  3334  
  3335  	return seq, ts, err
  3336  }
  3337  
  3338  // skipMsg will update this message block for a skipped message.
  3339  // If we do not have any messages, just update the metadata, otherwise
  3340  // we will place an empty record marking the sequence as used. The
  3341  // sequence will be marked erased.
  3342  // fs lock should be held.
  3343  func (mb *msgBlock) skipMsg(seq uint64, now time.Time) {
  3344  	if mb == nil {
  3345  		return
  3346  	}
  3347  	var needsRecord bool
  3348  
  3349  	nowts := now.UnixNano()
  3350  
  3351  	mb.mu.Lock()
  3352  	// If we are empty can just do meta.
  3353  	if mb.msgs == 0 {
  3354  		atomic.StoreUint64(&mb.last.seq, seq)
  3355  		mb.last.ts = nowts
  3356  		atomic.StoreUint64(&mb.first.seq, seq+1)
  3357  		mb.first.ts = nowts
  3358  	} else {
  3359  		needsRecord = true
  3360  		mb.dmap.Insert(seq)
  3361  	}
  3362  	mb.mu.Unlock()
  3363  
  3364  	if needsRecord {
  3365  		mb.writeMsgRecord(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, nowts, true)
  3366  	} else {
  3367  		mb.kickFlusher()
  3368  	}
  3369  }
  3370  
  3371  // SkipMsg will use the next sequence number but not store anything.
  3372  func (fs *fileStore) SkipMsg() uint64 {
  3373  	fs.mu.Lock()
  3374  	defer fs.mu.Unlock()
  3375  
  3376  	// Grab our current last message block.
  3377  	mb := fs.lmb
  3378  	if mb == nil || mb.msgs > 0 && mb.blkSize()+emptyRecordLen > fs.fcfg.BlockSize {
  3379  		if mb != nil && fs.fcfg.Compression != NoCompression {
  3380  			// We've now reached the end of this message block, if we want
  3381  			// to compress blocks then now's the time to do it.
  3382  			go mb.recompressOnDiskIfNeeded()
  3383  		}
  3384  		var err error
  3385  		if mb, err = fs.newMsgBlockForWrite(); err != nil {
  3386  			return 0
  3387  		}
  3388  	}
  3389  
  3390  	// Grab time and last seq.
  3391  	now, seq := time.Now().UTC(), fs.state.LastSeq+1
  3392  
  3393  	// Write skip msg.
  3394  	mb.skipMsg(seq, now)
  3395  
  3396  	// Update fs state.
  3397  	fs.state.LastSeq, fs.state.LastTime = seq, now
  3398  	if fs.state.Msgs == 0 {
  3399  		fs.state.FirstSeq, fs.state.FirstTime = seq, now
  3400  	}
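        	// A skipped sequence holds no message, so if it would be the first sequence, bump first past it.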
  3401  	if seq == fs.state.FirstSeq {
  3402  		fs.state.FirstSeq, fs.state.FirstTime = seq+1, now
  3403  	}
  3404  	// Mark as dirty for stream state.
  3405  	fs.dirty++
  3406  
  3407  	return seq
  3408  }
  3409  
  3410  // Skip multiple msgs. We will determine whether they fit into the current lmb or we need to create a new block.
  3411  func (fs *fileStore) SkipMsgs(seq uint64, num uint64) error {
  3412  	fs.mu.Lock()
  3413  	defer fs.mu.Unlock()
  3414  
  3415  	// Check sequence matches our last sequence.
  3416  	if seq != fs.state.LastSeq+1 {
  3417  		if seq > 0 {
  3418  			return ErrSequenceMismatch
  3419  		}
  3420  		seq = fs.state.LastSeq + 1
  3421  	}
  3422  
  3423  	// Limit number of dmap entries
  3424  	const maxDeletes = 64 * 1024
  3425  	mb := fs.lmb
  3426  
  3427  	numDeletes := int(num)
  3428  	if mb != nil {
  3429  		numDeletes += mb.dmap.Size()
  3430  	}
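        	// Roll to a new block if we have none, if adding these deletes would exceed the dmap limit, or if the block is full.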
  3431  	if mb == nil || numDeletes > maxDeletes && mb.msgs > 0 || mb.msgs > 0 && mb.blkSize()+emptyRecordLen > fs.fcfg.BlockSize {
  3432  		if mb != nil && fs.fcfg.Compression != NoCompression {
  3433  			// We've now reached the end of this message block, if we want
  3434  			// to compress blocks then now's the time to do it.
  3435  			go mb.recompressOnDiskIfNeeded()
  3436  		}
  3437  		var err error
  3438  		if mb, err = fs.newMsgBlockForWrite(); err != nil {
  3439  			return err
  3440  		}
  3441  	}
  3442  
  3443  	// Insert into dmap all entries and place last as marker.
  3444  	now := time.Now().UTC()
  3445  	nowts := now.UnixNano()
  3446  	lseq := seq + num - 1
  3447  
  3448  	mb.mu.Lock()
  3449  	var needsRecord bool
  3450  	// If we are empty update meta directly.
  3451  	if mb.msgs == 0 {
  3452  		atomic.StoreUint64(&mb.last.seq, lseq)
  3453  		mb.last.ts = nowts
  3454  		atomic.StoreUint64(&mb.first.seq, lseq+1)
  3455  		mb.first.ts = nowts
  3456  	} else {
  3457  		needsRecord = true
  3458  		for ; seq <= lseq; seq++ {
  3459  			mb.dmap.Insert(seq)
  3460  		}
  3461  	}
  3462  	mb.mu.Unlock()
  3463  
  3464  	// Write out our placeholder.
  3465  	if needsRecord {
  3466  		mb.writeMsgRecord(emptyRecordLen, lseq|ebit, _EMPTY_, nil, nil, nowts, true)
  3467  	}
  3468  
  3469  	// Now update FS accounting.
  3470  	// Update fs state.
  3471  	fs.state.LastSeq, fs.state.LastTime = lseq, now
  3472  	if fs.state.Msgs == 0 {
  3473  		fs.state.FirstSeq, fs.state.FirstTime = lseq+1, now
  3474  	}
  3475  
  3476  	// Mark as dirty for stream state.
  3477  	fs.dirty++
  3478  
  3479  	return nil
  3480  }
  3481  
  3482  // Lock should be held.
  3483  func (fs *fileStore) rebuildFirst() {
  3484  	if len(fs.blks) == 0 {
  3485  		return
  3486  	}
  3487  	fmb := fs.blks[0]
  3488  	if fmb == nil {
  3489  		return
  3490  	}
  3491  
  3492  	ld, _, _ := fmb.rebuildState()
  3493  	fmb.mu.RLock()
  3494  	isEmpty := fmb.msgs == 0
  3495  	fmb.mu.RUnlock()
  3496  	if isEmpty {
  3497  		fmb.mu.Lock()
  3498  		fs.removeMsgBlock(fmb)
  3499  		fmb.mu.Unlock()
  3500  	}
  3501  	fs.selectNextFirst()
  3502  	fs.rebuildStateLocked(ld)
  3503  }
  3504  
  3505  // Optimized helper function to return first sequence.
  3506  // subj will always be publish subject here, meaning non-wildcard.
  3507  // We assume a fast check that this subj even exists already happened.
  3508  // Lock should be held.
  3509  func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) {
  3510  	if len(fs.blks) == 0 {
  3511  		return 0, nil
  3512  	}
  3513  
  3514  	// See if we can optimize where we start.
  3515  	start, stop := fs.blks[0].index, fs.lmb.index
  3516  	if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
  3517  		start, stop = info.fblk, info.lblk
  3518  	}
  3519  
  3520  	for i := start; i <= stop; i++ {
  3521  		mb := fs.bim[i]
  3522  		if mb == nil {
  3523  			continue
  3524  		}
  3525  		mb.mu.Lock()
  3526  		var shouldExpire bool
  3527  		if mb.fssNotLoaded() {
  3528  			// Make sure we have fss loaded.
  3529  			if err := mb.loadMsgsWithLock(); err != nil {
  3530  				mb.mu.Unlock()
  3531  				return 0, err
  3532  			}
  3533  			shouldExpire = true
  3534  		}
  3535  		if ss := mb.fss[subj]; ss != nil {
  3536  			// Adjust first if it was not where we thought it should be.
  3537  			if i != start {
  3538  				if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
  3539  					info.fblk = i
  3540  				}
  3541  			}
  3542  			if ss.firstNeedsUpdate {
  3543  				mb.recalculateFirstForSubj(subj, ss.First, ss)
  3544  			}
  3545  			mb.mu.Unlock()
  3546  			return ss.First, nil
  3547  		}
  3548  		// If we did not find it and we had to load this msgBlock, try to expire its cache (as long as it is not the last block).
  3549  		if shouldExpire {
  3550  			// Expire this cache before moving on.
  3551  			mb.tryForceExpireCacheLocked()
  3552  		}
  3553  		mb.mu.Unlock()
  3554  	}
  3555  	return 0, nil
  3556  }
  3557  
  3558  // Will check the msg limit and drop firstSeq msg if needed.
  3559  // Lock should be held.
  3560  func (fs *fileStore) enforceMsgLimit() {
  3561  	if fs.cfg.Discard != DiscardOld {
  3562  		return
  3563  	}
  3564  	if fs.cfg.MaxMsgs <= 0 || fs.state.Msgs <= uint64(fs.cfg.MaxMsgs) {
  3565  		return
  3566  	}
  3567  	for nmsgs := fs.state.Msgs; nmsgs > uint64(fs.cfg.MaxMsgs); nmsgs = fs.state.Msgs {
  3568  		if removed, err := fs.deleteFirstMsg(); err != nil || !removed {
  3569  			fs.rebuildFirst()
  3570  			return
  3571  		}
  3572  	}
  3573  }
  3574  
  3575  // Will check the bytes limit and drop msgs if needed.
  3576  // Lock should be held.
  3577  func (fs *fileStore) enforceBytesLimit() {
  3578  	if fs.cfg.Discard != DiscardOld {
  3579  		return
  3580  	}
  3581  	if fs.cfg.MaxBytes <= 0 || fs.state.Bytes <= uint64(fs.cfg.MaxBytes) {
  3582  		return
  3583  	}
  3584  	for bs := fs.state.Bytes; bs > uint64(fs.cfg.MaxBytes); bs = fs.state.Bytes {
  3585  		if removed, err := fs.deleteFirstMsg(); err != nil || !removed {
  3586  			fs.rebuildFirst()
  3587  			return
  3588  		}
  3589  	}
  3590  }
  3591  
  3592  // Will make sure limits are honored for max msgs per subject on recovery or config update.
  3593  // We will make sure to go through all msg blocks, but in practice this
  3594  // will most likely only involve the last one, so we can take a more conservative approach.
  3595  // Lock should be held.
  3596  func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) {
  3597  	maxMsgsPer := uint64(fs.cfg.MaxMsgsPer)
  3598  
  3599  	// We may want to suppress callbacks from remove during this process
  3600  	// since these should have already been deleted and accounted for.
  3601  	if !fireCallback {
  3602  		cb := fs.scb
  3603  		fs.scb = nil
  3604  		defer func() { fs.scb = cb }()
  3605  	}
  3606  
  3607  	var numMsgs uint64
  3608  
  3609  	// collect all that are not correct.
  3610  	needAttention := make(map[string]*psi)
  3611  	fs.psim.Iter(func(subj []byte, psi *psi) bool {
  3612  		numMsgs += psi.total
  3613  		if psi.total > maxMsgsPer {
  3614  			needAttention[string(subj)] = psi
  3615  		}
  3616  		return true
  3617  	})
  3618  
  3619  	// We had a use case where psim (and hence fss) were correct but idx was not, and the skew was not being properly caught.
  3620  	// So do a quick sanity check here. If we detect a skew, do a rebuild and then re-check.
  3621  	if numMsgs != fs.state.Msgs {
  3622  		fs.warn("Detected skew in subject-based total (%d) vs raw total (%d), rebuilding", numMsgs, fs.state.Msgs)
  3623  		// Clear any global subject state.
  3624  		fs.psim, fs.tsl = fs.psim.Empty(), 0
  3625  		for _, mb := range fs.blks {
  3626  			ld, _, err := mb.rebuildState()
  3627  			if err != nil && ld != nil {
  3628  				fs.addLostData(ld)
  3629  			}
  3630  			fs.populateGlobalPerSubjectInfo(mb)
  3631  		}
  3632  		// Rebuild fs state too.
  3633  		fs.rebuildStateLocked(nil)
  3634  		// Need to redo blocks that need attention.
  3635  		needAttention = make(map[string]*psi)
  3636  		fs.psim.Iter(func(subj []byte, psi *psi) bool {
  3637  			if psi.total > maxMsgsPer {
  3638  				needAttention[string(subj)] = psi
  3639  			}
  3640  			return true
  3641  		})
  3642  	}
  3643  
  3644  	// Collect all the msgBlks we alter.
  3645  	blks := make(map[*msgBlock]struct{})
  3646  
  3647  	// For re-use below.
  3648  	var sm StoreMsg
  3649  
  3650  	// Walk all subjects that need attention here.
  3651  	for subj, info := range needAttention {
  3652  		total, start, stop := info.total, info.fblk, info.lblk
  3653  
  3654  		for i := start; i <= stop; i++ {
  3655  			mb := fs.bim[i]
  3656  			if mb == nil {
  3657  				continue
  3658  			}
  3659  			// Grab the ss entry for this subject in case sparse.
  3660  			mb.mu.Lock()
  3661  			mb.ensurePerSubjectInfoLoaded()
  3662  			ss := mb.fss[subj]
  3663  			if ss != nil && ss.firstNeedsUpdate {
  3664  				mb.recalculateFirstForSubj(subj, ss.First, ss)
  3665  			}
  3666  			mb.mu.Unlock()
  3667  			if ss == nil {
  3668  				continue
  3669  			}
  3670  			for seq := ss.First; seq <= ss.Last && total > maxMsgsPer; {
  3671  				m, _, err := mb.firstMatching(subj, false, seq, &sm)
  3672  				if err == nil {
  3673  					seq = m.seq + 1
  3674  					if removed, _ := fs.removeMsgViaLimits(m.seq); removed {
  3675  						total--
  3676  						blks[mb] = struct{}{}
  3677  					}
  3678  				} else {
  3679  					// On error just do single increment.
  3680  					seq++
  3681  				}
  3682  			}
  3683  		}
  3684  	}
  3685  
  3686  	// Expire the cache if we can.
  3687  	for mb := range blks {
  3688  		mb.mu.Lock()
  3689  		if mb.msgs > 0 {
  3690  			mb.tryForceExpireCacheLocked()
  3691  		}
  3692  		mb.mu.Unlock()
  3693  	}
  3694  }
  3695  
  3696  // Lock should be held.
  3697  func (fs *fileStore) deleteFirstMsg() (bool, error) {
  3698  	return fs.removeMsgViaLimits(fs.state.FirstSeq)
  3699  }
  3700  
  3701  // If we remove via limits, which can always be recovered on a restart, we
  3702  // do not force the system to update the index file.
  3703  // Lock should be held.
  3704  func (fs *fileStore) removeMsgViaLimits(seq uint64) (bool, error) {
  3705  	return fs.removeMsg(seq, false, true, false)
  3706  }
  3707  
  3708  // RemoveMsg will remove the message from this store.
  3709  // Will return whether the message was successfully removed.
  3710  func (fs *fileStore) RemoveMsg(seq uint64) (bool, error) {
  3711  	return fs.removeMsg(seq, false, false, true)
  3712  }
  3713  
  3714  func (fs *fileStore) EraseMsg(seq uint64) (bool, error) {
  3715  	return fs.removeMsg(seq, true, false, true)
  3716  }
  3717  
  3718  // Convenience function to remove per subject tracking at the filestore level.
  3719  // Lock should be held.
  3720  func (fs *fileStore) removePerSubject(subj string) {
  3721  	if len(subj) == 0 || fs.psim == nil {
  3722  		return
  3723  	}
  3724  	// We do not update fblk here, but will do so when we resolve it during lookup.
  3725  	bsubj := stringToBytes(subj)
  3726  	if info, ok := fs.psim.Find(bsubj); ok {
  3727  		info.total--
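        		// When only a single message remains for this subject, collapse the block range so fblk points at the last block we saw it in.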
  3728  		if info.total == 1 {
  3729  			info.fblk = info.lblk
  3730  		} else if info.total == 0 {
  3731  			if _, ok = fs.psim.Delete(bsubj); ok {
  3732  				fs.tsl -= len(subj)
  3733  			}
  3734  		}
  3735  	}
  3736  }
  3737  
  3738  // Remove a message, optionally rewriting the mb file.
  3739  func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (bool, error) {
  3740  	if seq == 0 {
  3741  		return false, ErrStoreMsgNotFound
  3742  	}
  3743  	fsLock := func() {
  3744  		if needFSLock {
  3745  			fs.mu.Lock()
  3746  		}
  3747  	}
  3748  	fsUnlock := func() {
  3749  		if needFSLock {
  3750  			fs.mu.Unlock()
  3751  		}
  3752  	}
  3753  
  3754  	fsLock()
  3755  
  3756  	if fs.closed {
  3757  		fsUnlock()
  3758  		return false, ErrStoreClosed
  3759  	}
  3760  	if !viaLimits && fs.sips > 0 {
  3761  		fsUnlock()
  3762  		return false, ErrStoreSnapshotInProgress
  3763  	}
  3764  	// If in encrypted mode negate secure rewrite here.
  3765  	if secure && fs.prf != nil {
  3766  		secure = false
  3767  	}
  3768  
  3769  	if fs.state.Msgs == 0 {
  3770  		var err = ErrStoreEOF
  3771  		if seq <= fs.state.LastSeq {
  3772  			err = ErrStoreMsgNotFound
  3773  		}
  3774  		fsUnlock()
  3775  		return false, err
  3776  	}
  3777  
  3778  	mb := fs.selectMsgBlock(seq)
  3779  	if mb == nil {
  3780  		var err = ErrStoreEOF
  3781  		if seq <= fs.state.LastSeq {
  3782  			err = ErrStoreMsgNotFound
  3783  		}
  3784  		fsUnlock()
  3785  		return false, err
  3786  	}
  3787  
  3788  	mb.mu.Lock()
  3789  
  3790  	// See if we are closed or the sequence number is still relevant.
  3791  	if mb.closed || seq < atomic.LoadUint64(&mb.first.seq) {
  3792  		mb.mu.Unlock()
  3793  		fsUnlock()
  3794  		return false, nil
  3795  	}
  3796  
  3797  	// Now check dmap if it is there.
  3798  	if mb.dmap.Exists(seq) {
  3799  		mb.mu.Unlock()
  3800  		fsUnlock()
  3801  		return false, nil
  3802  	}
  3803  
  3804  	// We used to only have to load in the messages for callbacks or the filtered subject state, but the latter is now always on.
  3805  	// Now just load regardless.
  3806  	// TODO(dlc) - Figure out a way not to have to load it in, we need subject tracking outside main data block.
  3807  	if mb.cacheNotLoaded() {
  3808  		// We do not want to block possible activity within another msg block.
  3809  		// We have to unlock both locks and acquire the mb lock in the loadMsgs() call to avoid a deadlock if another
  3810  		// go routine was trying to get fs then this mb lock at the same time. E.g. another call to remove for same block.
  3811  		mb.mu.Unlock()
  3812  		fsUnlock()
  3813  		if err := mb.loadMsgs(); err != nil {
  3814  			return false, err
  3815  		}
  3816  		fsLock()
  3817  		// We need to check if things changed out from underneath us.
  3818  		if fs.closed {
  3819  			fsUnlock()
  3820  			return false, ErrStoreClosed
  3821  		}
  3822  		mb.mu.Lock()
  3823  		if mb.closed || seq < atomic.LoadUint64(&mb.first.seq) {
  3824  			mb.mu.Unlock()
  3825  			fsUnlock()
  3826  			return false, nil
  3827  		}
  3828  		// cacheLookup below will do dmap check so no need to repeat here.
  3829  	}
  3830  
  3831  	var smv StoreMsg
  3832  	sm, err := mb.cacheLookup(seq, &smv)
  3833  	if err != nil {
  3834  		mb.mu.Unlock()
  3835  		fsUnlock()
  3836  		// Mimic err behavior from above check to dmap. No error returned if already removed.
  3837  		if err == errDeletedMsg {
  3838  			err = nil
  3839  		}
  3840  		return false, err
  3841  	}
  3842  	// Grab size
  3843  	msz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
  3844  
  3845  	// Set cache timestamp for last remove.
  3846  	mb.lrts = time.Now().UnixNano()
  3847  
  3848  	// Global stats
  3849  	if fs.state.Msgs > 0 {
  3850  		fs.state.Msgs--
  3851  	}
  3852  	if msz < fs.state.Bytes {
  3853  		fs.state.Bytes -= msz
  3854  	} else {
  3855  		fs.state.Bytes = 0
  3856  	}
  3857  
  3858  	// Now local mb updates.
  3859  	if mb.msgs > 0 {
  3860  		mb.msgs--
  3861  	}
  3862  	if msz < mb.bytes {
  3863  		mb.bytes -= msz
  3864  	} else {
  3865  		mb.bytes = 0
  3866  	}
  3867  
  3868  	// Mark as dirty for stream state.
  3869  	fs.dirty++
  3870  
  3871  	// If we are tracking subjects here make sure we update that accounting.
  3872  	mb.ensurePerSubjectInfoLoaded()
  3873  
  3874  	// If we are tracking multiple subjects here make sure we update that accounting.
  3875  	mb.removeSeqPerSubject(sm.subj, seq)
  3876  	fs.removePerSubject(sm.subj)
  3877  
  3878  	if secure {
  3879  		// Grab record info.
  3880  		ri, rl, _, _ := mb.slotInfo(int(seq - mb.cache.fseq))
  3881  		mb.eraseMsg(seq, int(ri), int(rl))
  3882  	}
  3883  
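        	// Determine if this is a FIFO removal of the block's first message; otherwise it is an out-of-order delete tracked via dmap.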
  3884  	fifo := seq == atomic.LoadUint64(&mb.first.seq)
  3885  	isLastBlock := mb == fs.lmb
  3886  	isEmpty := mb.msgs == 0
  3887  
  3888  	if fifo {
  3889  		mb.selectNextFirst()
  3890  		if !isEmpty {
  3891  			// Can update this one in place.
  3892  			if seq == fs.state.FirstSeq {
  3893  				fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) // new one.
  3894  				if mb.first.ts == 0 {
  3895  					fs.state.FirstTime = time.Time{}
  3896  				} else {
  3897  					fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
  3898  				}
  3899  			}
  3900  		}
  3901  	} else if !isEmpty {
  3902  		// Out of order delete.
  3903  		mb.dmap.Insert(seq)
  3904  		// Make a simple check here similar to Compact(): if we can save 50% and are over a certain threshold, compact inline.
  3905  		// All other more thorough cleanup will happen in syncBlocks logic.
  3906  		// Note that we do not have to store empty records for deleted messages, so don't use them in the calculation.
  3907  		// TODO(dlc) - This should not be inline, should kick the sync routine.
  3908  		if mb.rbytes > compactMinimum && mb.bytes*2 < mb.rbytes && !isLastBlock {
  3909  			mb.compact()
  3910  			fs.kickFlushStateLoop()
  3911  		}
  3912  	}
  3913  
  3914  	if secure {
  3915  		if ld, _ := mb.flushPendingMsgsLocked(); ld != nil {
  3916  			// We have the mb lock here, this needs the mb locks so do in its own go routine.
  3917  			go fs.rebuildState(ld)
  3918  		}
  3919  	}
  3920  
  3921  	// If empty remove this block and check if we need to update first sequence.
  3922  	// We will write a tombstone at the end.
  3923  	var firstSeqNeedsUpdate bool
  3924  	if isEmpty {
  3925  		// This writes a tombstone iff mb == lmb, so no need to do so below.
  3926  		fs.removeMsgBlock(mb)
  3927  		firstSeqNeedsUpdate = seq == fs.state.FirstSeq
  3928  	}
  3929  	mb.mu.Unlock()
  3930  
  3931  	// If we emptied the current message block and the seq was state.FirstSeq
  3932  	// then we need to jump message blocks. We will also write the index so
  3933  	// we don't lose track of the first sequence.
  3934  	if firstSeqNeedsUpdate {
  3935  		fs.selectNextFirst()
  3936  	}
  3937  
  3938  	// Check if we need to write a deleted record tombstone.
  3939  	// This is for user initiated removes or to hold the first seq
  3940  	// when the last block is empty.
  3941  
  3942  	// If not removed via limits, and not both empty and the last block (an empty last block writes a tombstone above), write a tombstone.
  3943  	if !viaLimits && !(isEmpty && isLastBlock) {
  3944  		if lmb := fs.lmb; sm != nil && lmb != nil {
  3945  			lmb.writeTombstone(sm.seq, sm.ts)
  3946  		}
  3947  	}
  3948  
  3949  	if cb := fs.scb; cb != nil {
  3950  		// If we have a callback registered we need to release lock regardless since cb might need it to lookup msg, etc.
  3951  		fs.mu.Unlock()
  3952  		// Storage updates.
  3953  		var subj string
  3954  		if sm != nil {
  3955  			subj = sm.subj
  3956  		}
  3957  		delta := int64(msz)
  3958  		cb(-1, -delta, seq, subj)
  3959  
  3960  		if !needFSLock {
  3961  			fs.mu.Lock()
  3962  		}
  3963  	} else if needFSLock {
  3964  		// We acquired it so release it.
  3965  		fs.mu.Unlock()
  3966  	}
  3967  
  3968  	return true, nil
  3969  }
  3970  
  3971  // This will compact and rewrite this block. This should only be called when we know we want to rewrite this block.
  3972  // This should not be called on the lmb since we will prune tail deleted messages which could cause issues with
  3973  // writing new messages. We will silently bail on any issues with the underlying block and let someone else detect.
  3974  // Write lock needs to be held.
  3975  func (mb *msgBlock) compact() {
  3976  	wasLoaded := mb.cacheAlreadyLoaded()
  3977  	if !wasLoaded {
  3978  		if err := mb.loadMsgsWithLock(); err != nil {
  3979  			return
  3980  		}
  3981  	}
  3982  
  3983  	buf := mb.cache.buf
  3984  	nbuf := getMsgBlockBuf(len(buf))
  3985  	// Recycle our nbuf when we are done.
  3986  	defer recycleMsgBlockBuf(nbuf)
  3987  
  3988  	var le = binary.LittleEndian
  3989  	var firstSet bool
  3990  
  3991  	fseq := atomic.LoadUint64(&mb.first.seq)
  3992  	isDeleted := func(seq uint64) bool {
  3993  		return seq == 0 || seq&ebit != 0 || mb.dmap.Exists(seq) || seq < fseq
  3994  	}
  3995  
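        	// Walk the raw block buffer record by record, copying only live (non-deleted) records into nbuf.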
  3996  	for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; {
  3997  		if index+msgHdrSize > lbuf {
  3998  			return
  3999  		}
  4000  		hdr := buf[index : index+msgHdrSize]
  4001  		rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:])
  4002  		// Clear any headers bit that could be set.
  4003  		rl &^= hbit
  4004  		dlen := int(rl) - msgHdrSize
  4005  		// Do some quick sanity checks here.
  4006  		if dlen < 0 || int(slen) > dlen || dlen > int(rl) || rl > rlBadThresh || index+rl > lbuf {
  4007  			return
  4008  		}
  4009  		// Only need to process non-deleted messages.
  4010  		seq := le.Uint64(hdr[4:])
  4011  
  4012  		if !isDeleted(seq) {
  4013  			// Check for tombstones.
  4014  			if seq&tbit != 0 {
  4015  				// If we are the last mb we should consider keeping these, unless the tombstone reflects a seq in this mb.
  4016  				if mb == mb.fs.lmb && seq < fseq {
  4017  					nbuf = append(nbuf, buf[index:index+rl]...)
  4018  				}
  4019  			} else {
  4020  				// Normal message here.
  4021  				nbuf = append(nbuf, buf[index:index+rl]...)
  4022  				if !firstSet {
  4023  					firstSet = true
  4024  					atomic.StoreUint64(&mb.first.seq, seq)
  4025  				}
  4026  			}
  4027  		}
  4028  		// Advance to next record.
  4029  		index += rl
  4030  	}
  4031  
  4032  	// Handle compression
  4033  	if mb.cmp != NoCompression {
  4034  		cbuf, err := mb.cmp.Compress(nbuf)
  4035  		if err != nil {
  4036  			return
  4037  		}
  4038  		meta := &CompressionInfo{
  4039  			Algorithm:    mb.cmp,
  4040  			OriginalSize: uint64(len(nbuf)),
  4041  		}
  4042  		nbuf = append(meta.MarshalMetadata(), cbuf...)
  4043  	}
  4044  
  4045  	// Check for encryption.
  4046  	if mb.bek != nil && len(nbuf) > 0 {
  4047  		// Recreate to reset counter.
  4048  		rbek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  4049  		if err != nil {
  4050  			return
  4051  		}
  4052  		rbek.XORKeyStream(nbuf, nbuf)
  4053  	}
  4054  
  4055  	// Close FDs first.
  4056  	mb.closeFDsLocked()
  4057  
  4058  	// We will write to a new file and mv/rename it in case of failure.
  4059  	mfn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(newScan, mb.index))
  4060  	<-dios
  4061  	err := os.WriteFile(mfn, nbuf, defaultFilePerms)
  4062  	dios <- struct{}{}
  4063  	if err != nil {
  4064  		os.Remove(mfn)
  4065  		return
  4066  	}
  4067  	if err := os.Rename(mfn, mb.mfn); err != nil {
  4068  		os.Remove(mfn)
  4069  		return
  4070  	}
  4071  
  4072  	// Capture the updated rbytes.
  4073  	mb.rbytes = uint64(len(nbuf))
  4074  
  4075  	// Remove any seqs from the beginning of the blk.
  4076  	for seq, nfseq := fseq, atomic.LoadUint64(&mb.first.seq); seq < nfseq; seq++ {
  4077  		mb.dmap.Delete(seq)
  4078  	}
  4079  	// Make sure we clear the cache since no longer valid.
  4080  	mb.clearCacheAndOffset()
  4081  	// If we entered with the msgs loaded make sure to reload them.
  4082  	if wasLoaded {
  4083  		mb.loadMsgsWithLock()
  4084  	}
  4085  }
  4086  
  4087  // Grab info from a slot.
  4088  // Lock should be held.
  4089  func (mb *msgBlock) slotInfo(slot int) (uint32, uint32, bool, error) {
  4090  	if mb.cache == nil || slot >= len(mb.cache.idx) {
  4091  		return 0, 0, false, errPartialCache
  4092  	}
  4093  
  4094  	bi := mb.cache.idx[slot]
  4095  	ri, hashChecked := (bi &^ hbit), (bi&hbit) != 0
  4096  
  4097  	// If this is a deleted slot return here.
  4098  	if bi == dbit {
  4099  		return 0, 0, false, errDeletedMsg
  4100  	}
  4101  
  4102  	// Determine record length
  4103  	var rl uint32
  4104  	if slot >= len(mb.cache.idx) {
  4105  		rl = mb.cache.lrl
  4106  	} else {
  4107  		// Need to account for dbit markers in idx.
  4108  		// So we will walk until we find valid idx slot to calculate rl.
  4109  		for i := 1; slot+i < len(mb.cache.idx); i++ {
  4110  			ni := mb.cache.idx[slot+i] &^ hbit
  4111  			if ni == dbit {
  4112  				continue
  4113  			}
  4114  			rl = ni - ri
  4115  			break
  4116  		}
  4117  		// check if we had all trailing dbits.
  4118  		// If so use len of cache buf minus ri.
  4119  		if rl == 0 {
  4120  			rl = uint32(len(mb.cache.buf)) - ri
  4121  		}
  4122  	}
  4123  	if rl < msgHdrSize {
  4124  		return 0, 0, false, errBadMsg
  4125  	}
  4126  	return uint32(ri), rl, hashChecked, nil
  4127  }
  4128  
  4129  func (fs *fileStore) isClosed() bool {
  4130  	fs.mu.RLock()
  4131  	closed := fs.closed
  4132  	fs.mu.RUnlock()
  4133  	return closed
  4134  }
  4135  
  4136  // Will spin up our flush loop.
  4137  func (mb *msgBlock) spinUpFlushLoop() {
  4138  	mb.mu.Lock()
  4139  	defer mb.mu.Unlock()
  4140  
  4141  	// Are we already running or closed?
  4142  	if mb.flusher || mb.closed {
  4143  		return
  4144  	}
  4145  	mb.flusher = true
  4146  	mb.fch = make(chan struct{}, 1)
  4147  	mb.qch = make(chan struct{})
  4148  	fch, qch := mb.fch, mb.qch
  4149  
  4150  	go mb.flushLoop(fch, qch)
  4151  }
  4152  
  4153  // Raw low level kicker for flush loops.
  4154  func kickFlusher(fch chan struct{}) {
  4155  	if fch != nil {
  4156  		select {
  4157  		case fch <- struct{}{}:
  4158  		default:
  4159  		}
  4160  	}
  4161  }
  4162  
  4163  // Kick flusher for this message block.
  4164  func (mb *msgBlock) kickFlusher() {
  4165  	mb.mu.RLock()
  4166  	defer mb.mu.RUnlock()
  4167  	kickFlusher(mb.fch)
  4168  }
  4169  
  4170  func (mb *msgBlock) setInFlusher() {
  4171  	mb.mu.Lock()
  4172  	mb.flusher = true
  4173  	mb.mu.Unlock()
  4174  }
  4175  
  4176  func (mb *msgBlock) clearInFlusher() {
  4177  	mb.mu.Lock()
  4178  	mb.flusher = false
  4179  	mb.mu.Unlock()
  4180  }
  4181  
  4182  // flushLoop watches for messages, index info, or recently closed msg block updates.
  4183  func (mb *msgBlock) flushLoop(fch, qch chan struct{}) {
  4184  	mb.setInFlusher()
  4185  	defer mb.clearInFlusher()
  4186  
  4187  	for {
  4188  		select {
  4189  		case <-fch:
  4190  			// If we have pending messages process them first.
  4191  			if waiting := mb.pendingWriteSize(); waiting != 0 {
  4192  				ts := 1 * time.Millisecond
  4193  				var waited time.Duration
  4194  
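        				// Coalesce small writes: sleep with exponential backoff until we have at least coalesceMinimum bytes pending, the pending size stops growing, or we exceed maxFlushWait.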
  4195  				for waiting < coalesceMinimum {
  4196  					time.Sleep(ts)
  4197  					select {
  4198  					case <-qch:
  4199  						return
  4200  					default:
  4201  					}
  4202  					newWaiting := mb.pendingWriteSize()
  4203  					if waited = waited + ts; waited > maxFlushWait || newWaiting <= waiting {
  4204  						break
  4205  					}
  4206  					waiting = newWaiting
  4207  					ts *= 2
  4208  				}
  4209  				mb.flushPendingMsgs()
  4210  				// Check if we are no longer the last message block. If we are
  4211  				// not we can close FDs and exit.
  4212  				mb.fs.mu.RLock()
  4213  				notLast := mb != mb.fs.lmb
  4214  				mb.fs.mu.RUnlock()
  4215  				if notLast {
  4216  					if err := mb.closeFDs(); err == nil {
  4217  						return
  4218  					}
  4219  				}
  4220  			}
  4221  		case <-qch:
  4222  			return
  4223  		}
  4224  	}
  4225  }
  4226  
  4227  // Lock should be held.
  4228  func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error {
  4229  	var le = binary.LittleEndian
  4230  	var hdr [msgHdrSize]byte
  4231  
  4232  	le.PutUint32(hdr[0:], uint32(rl))
  4233  	le.PutUint64(hdr[4:], seq|ebit)
  4234  	le.PutUint64(hdr[12:], 0)
  4235  	le.PutUint16(hdr[20:], 0)
  4236  
  4237  	// Randomize record
  4238  	data := make([]byte, rl-emptyRecordLen)
  4239  	rand.Read(data)
  4240  
  4241  	// Now write to underlying buffer.
  4242  	var b bytes.Buffer
  4243  	b.Write(hdr[:])
  4244  	b.Write(data)
  4245  
  4246  	// Calculate hash.
  4247  	mb.hh.Reset()
  4248  	mb.hh.Write(hdr[4:20])
  4249  	mb.hh.Write(data)
  4250  	checksum := mb.hh.Sum(nil)
  4251  	// Write to msg record.
  4252  	b.Write(checksum)
  4253  
  4254  	// Update both cache and disk.
  4255  	nbytes := b.Bytes()
  4256  
  4257  	// Cache
  4258  	if ri >= mb.cache.off {
  4259  		li := ri - mb.cache.off
  4260  		buf := mb.cache.buf[li : li+rl]
  4261  		copy(buf, nbytes)
  4262  	}
  4263  
  4264  	// Disk
  4265  	if mb.cache.off+mb.cache.wp > ri {
  4266  		<-dios
  4267  		mfd, err := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms)
  4268  		dios <- struct{}{}
  4269  		if err != nil {
  4270  			return err
  4271  		}
  4272  		defer mfd.Close()
  4273  		if _, err = mfd.WriteAt(nbytes, int64(ri)); err == nil {
  4274  			mfd.Sync()
  4275  		}
  4276  		if err != nil {
  4277  			return err
  4278  		}
  4279  	}
  4280  	return nil
  4281  }
  4282  
  4283  // Truncate this message block to the storedMsg.
  4284  func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) {
  4285  	// Make sure we are loaded to process messages etc.
  4286  	if err := mb.loadMsgs(); err != nil {
  4287  		return 0, 0, err
  4288  	}
  4289  
  4290  	// Calculate new eof using slot info from our new last sm.
  4291  	ri, rl, _, err := mb.slotInfo(int(sm.seq - mb.cache.fseq))
  4292  	if err != nil {
  4293  		return 0, 0, err
  4294  	}
  4295  	// Calculate new eof.
  4296  	eof := int64(ri + rl)
  4297  
  4298  	var purged, bytes uint64
  4299  
  4300  	mb.mu.Lock()
  4301  
  4302  	checkDmap := mb.dmap.Size() > 0
  4303  	var smv StoreMsg
  4304  
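        	// Walk backwards from the current last sequence down to the new last, removing accounting for every message being truncated away.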
  4305  	for seq := atomic.LoadUint64(&mb.last.seq); seq > sm.seq; seq-- {
  4306  		if checkDmap {
  4307  			if mb.dmap.Exists(seq) {
  4308  				// Delete and skip to next.
  4309  				mb.dmap.Delete(seq)
  4310  				checkDmap = !mb.dmap.IsEmpty()
  4311  				continue
  4312  			}
  4313  		}
  4314  		// We should have a valid msg to calculate removal stats.
  4315  		if m, err := mb.cacheLookup(seq, &smv); err == nil {
  4316  			if mb.msgs > 0 {
  4317  				rl := fileStoreMsgSize(m.subj, m.hdr, m.msg)
  4318  				mb.msgs--
  4319  				if rl > mb.bytes {
  4320  					rl = mb.bytes
  4321  				}
  4322  				mb.bytes -= rl
  4323  				mb.rbytes -= rl
  4324  				// For return accounting.
  4325  				purged++
  4326  				bytes += uint64(rl)
  4327  			}
  4328  		}
  4329  	}
  4330  
  4331  	// If the block is compressed then we have to load it into memory
  4332  	// and decompress it, truncate it and then write it back out.
  4333  	// Otherwise, truncate the file itself and close the descriptor.
  4334  	if mb.cmp != NoCompression {
  4335  		buf, err := mb.loadBlock(nil)
  4336  		if err != nil {
  4337  			return 0, 0, fmt.Errorf("failed to load block from disk: %w", err)
  4338  		}
  4339  		if mb.bek != nil && len(buf) > 0 {
  4340  			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  4341  			if err != nil {
  4342  				return 0, 0, err
  4343  			}
  4344  			mb.bek = bek
  4345  			mb.bek.XORKeyStream(buf, buf)
  4346  		}
  4347  		buf, err = mb.decompressIfNeeded(buf)
  4348  		if err != nil {
  4349  			return 0, 0, fmt.Errorf("failed to decompress block: %w", err)
  4350  		}
  4351  		buf = buf[:eof]
  4352  		copy(mb.lchk[0:], buf[:len(buf)-checksumSize])
  4353  		buf, err = mb.cmp.Compress(buf)
  4354  		if err != nil {
  4355  			return 0, 0, fmt.Errorf("failed to recompress block: %w", err)
  4356  		}
  4357  		meta := &CompressionInfo{
  4358  			Algorithm:    mb.cmp,
  4359  			OriginalSize: uint64(eof),
  4360  		}
  4361  		buf = append(meta.MarshalMetadata(), buf...)
  4362  		if mb.bek != nil && len(buf) > 0 {
  4363  			bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  4364  			if err != nil {
  4365  				return 0, 0, err
  4366  			}
  4367  			mb.bek = bek
  4368  			mb.bek.XORKeyStream(buf, buf)
  4369  		}
  4370  		n, err := mb.writeAt(buf, 0)
  4371  		if err != nil {
  4372  			return 0, 0, fmt.Errorf("failed to rewrite compressed block: %w", err)
  4373  		}
  4374  		if n != len(buf) {
  4375  			return 0, 0, fmt.Errorf("short write (%d != %d)", n, len(buf))
  4376  		}
  4377  		mb.mfd.Truncate(int64(len(buf)))
  4378  		mb.mfd.Sync()
  4379  	} else if mb.mfd != nil {
  4380  		mb.mfd.Truncate(eof)
  4381  		mb.mfd.Sync()
  4382  		// Update our checksum.
  4383  		var lchk [8]byte
  4384  		mb.mfd.ReadAt(lchk[:], eof-8)
  4385  		copy(mb.lchk[0:], lchk[:])
  4386  	} else {
  4387  		mb.mu.Unlock()
  4388  		return 0, 0, fmt.Errorf("failed to truncate msg block %d, file not open", mb.index)
  4389  	}
  4390  
  4391  	// Update our last msg.
  4392  	atomic.StoreUint64(&mb.last.seq, sm.seq)
  4393  	mb.last.ts = sm.ts
  4394  
  4395  	// Clear our cache.
  4396  	mb.clearCacheAndOffset()
  4397  
  4398  	// Redo per subject info for this block.
  4399  	mb.resetPerSubjectInfo()
  4400  
  4401  	mb.mu.Unlock()
  4402  
  4403  	// Load msgs again.
  4404  	mb.loadMsgs()
  4405  
  4406  	return purged, bytes, nil
  4407  }
  4408  
  4409  // Helper to determine if the mb is empty.
  4410  func (mb *msgBlock) isEmpty() bool {
  4411  	return atomic.LoadUint64(&mb.first.seq) > atomic.LoadUint64(&mb.last.seq)
  4412  }
  4413  
  4414  // Lock should be held.
  4415  func (mb *msgBlock) selectNextFirst() {
  4416  	var seq uint64
  4417  	fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
  4418  	for seq = fseq + 1; seq <= lseq; seq++ {
  4419  		if mb.dmap.Exists(seq) {
  4420  			// We will move past this so we can delete the entry.
  4421  			mb.dmap.Delete(seq)
  4422  		} else {
  4423  			break
  4424  		}
  4425  	}
  4426  	// Set new first sequence.
  4427  	atomic.StoreUint64(&mb.first.seq, seq)
  4428  
  4429  	// Check if we are empty.
  4430  	if seq > lseq {
  4431  		mb.first.ts = 0
  4432  		return
  4433  	}
  4434  
  4435  	// Need to get the timestamp.
  4436  	// We will try the cache direct and fallback if needed.
  4437  	var smv StoreMsg
  4438  	sm, _ := mb.cacheLookup(seq, &smv)
  4439  	if sm == nil {
  4440  		// Slow path, need to unlock.
  4441  		mb.mu.Unlock()
  4442  		sm, _, _ = mb.fetchMsg(seq, &smv)
  4443  		mb.mu.Lock()
  4444  	}
  4445  	if sm != nil {
  4446  		mb.first.ts = sm.ts
  4447  	} else {
  4448  		mb.first.ts = 0
  4449  	}
  4450  }
  4451  
  4452  // Select the next FirstSeq
  4453  // Lock should be held.
  4454  func (fs *fileStore) selectNextFirst() {
  4455  	if len(fs.blks) > 0 {
  4456  		mb := fs.blks[0]
  4457  		mb.mu.RLock()
  4458  		fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq)
  4459  		fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
  4460  		mb.mu.RUnlock()
  4461  	} else {
  4462  		// Could not find anything, so treat like purge
  4463  		fs.state.FirstSeq = fs.state.LastSeq + 1
  4464  		fs.state.FirstTime = time.Time{}
  4465  	}
  4466  }
  4467  
  4468  // Lock should be held.
  4469  func (mb *msgBlock) resetCacheExpireTimer(td time.Duration) {
  4470  	if td == 0 {
  4471  		td = mb.cexp + 100*time.Millisecond
  4472  	}
  4473  	if mb.ctmr == nil {
  4474  		mb.ctmr = time.AfterFunc(td, mb.expireCache)
  4475  	} else {
  4476  		mb.ctmr.Reset(td)
  4477  	}
  4478  }
  4479  
  4480  // Lock should be held.
  4481  func (mb *msgBlock) startCacheExpireTimer() {
  4482  	mb.resetCacheExpireTimer(0)
  4483  }
  4484  
  4485  // Used when we load in a message block.
  4486  // Lock should be held.
  4487  func (mb *msgBlock) clearCacheAndOffset() {
  4488  	// Reset linear scan tracker.
  4489  	mb.llseq = 0
  4490  	if mb.cache != nil {
  4491  		mb.cache.off = 0
  4492  		mb.cache.wp = 0
  4493  	}
  4494  	mb.clearCache()
  4495  }
  4496  
  4497  // Lock should be held.
  4498  func (mb *msgBlock) clearCache() {
  4499  	if mb.ctmr != nil && mb.fss == nil {
  4500  		mb.ctmr.Stop()
  4501  		mb.ctmr = nil
  4502  	}
  4503  
  4504  	if mb.cache == nil {
  4505  		return
  4506  	}
  4507  
  4508  	buf := mb.cache.buf
  4509  	if mb.cache.off == 0 {
  4510  		mb.cache = nil
  4511  	} else {
  4512  		// Clear msgs and index.
  4513  		mb.cache.buf = nil
  4514  		mb.cache.idx = nil
  4515  		mb.cache.wp = 0
  4516  	}
  4517  	recycleMsgBlockBuf(buf)
  4518  }
  4519  
  4520  // Called to possibly expire a message block cache.
  4521  func (mb *msgBlock) expireCache() {
  4522  	mb.mu.Lock()
  4523  	defer mb.mu.Unlock()
  4524  	mb.expireCacheLocked()
  4525  }
  4526  
  4527  func (mb *msgBlock) tryForceExpireCache() {
  4528  	mb.mu.Lock()
  4529  	defer mb.mu.Unlock()
  4530  	mb.tryForceExpireCacheLocked()
  4531  }
  4532  
  4533  // We will attempt to force expire this by temporarily clearing the last load time.
  4534  func (mb *msgBlock) tryForceExpireCacheLocked() {
  4535  	llts := mb.llts
  4536  	mb.llts = 0
  4537  	mb.expireCacheLocked()
  4538  	mb.llts = llts
  4539  }
  4540  
  4541  // This is for expiration of the write cache, which will be partial with fip.
  4542  // So we want to bypass the Pools here.
  4543  // Lock should be held.
  4544  func (mb *msgBlock) tryExpireWriteCache() []byte {
  4545  	if mb.cache == nil {
  4546  		return nil
  4547  	}
  4548  	lwts, buf, llts, nra := mb.lwts, mb.cache.buf, mb.llts, mb.cache.nra
  4549  	mb.lwts, mb.cache.nra = 0, true
  4550  	mb.expireCacheLocked()
  4551  	mb.lwts = lwts
  4552  	if mb.cache != nil {
  4553  		mb.cache.nra = nra
  4554  	}
  4555  	// We could check for a certain time since last load, but to be safe just reuse if no loads at all.
  4556  	if llts == 0 && (mb.cache == nil || mb.cache.buf == nil) {
  4557  		// Clear last write time since we now are about to move on to a new lmb.
  4558  		mb.lwts = 0
  4559  		return buf[:0]
  4560  	}
  4561  	return nil
  4562  }
  4563  
  4564  // Lock should be held.
  4565  func (mb *msgBlock) expireCacheLocked() {
  4566  	if mb.cache == nil {
  4567  		if mb.ctmr != nil {
  4568  			mb.ctmr.Stop()
  4569  			mb.ctmr = nil
  4570  		}
  4571  		return
  4572  	}
  4573  
  4574  	// Can't expire if we still have pending.
  4575  	if mb.cache != nil && len(mb.cache.buf)-int(mb.cache.wp) > 0 {
  4576  		mb.resetCacheExpireTimer(mb.cexp)
  4577  		return
  4578  	}
  4579  
  4580  	// Grab timestamp to compare.
  4581  	tns := time.Now().UnixNano()
  4582  
  4583  	// For the core buffer of messages, we care about reads and writes, but not removes.
  4584  	bufts := mb.llts
  4585  	if mb.lwts > bufts {
  4586  		bufts = mb.lwts
  4587  	}
  4588  
  4589  	// Check for activity on the cache that would prevent us from expiring.
  4590  	if tns-bufts <= int64(mb.cexp) {
  4591  		mb.resetCacheExpireTimer(mb.cexp - time.Duration(tns-bufts))
  4592  		return
  4593  	}
  4594  
  4595  	// If we are here we will at least expire the core msg buffer.
  4596  	// We need to capture offset in case we do a write next before a full load.
  4597  	if mb.cache != nil {
  4598  		mb.cache.off += len(mb.cache.buf)
  4599  		if !mb.cache.nra {
  4600  			recycleMsgBlockBuf(mb.cache.buf)
  4601  		}
  4602  		mb.cache.buf = nil
  4603  		mb.cache.wp = 0
  4604  	}
  4605  
  4606  	// Check if we can clear out our idx unless under force expire.
  4607  	// fss we keep longer and expire under sync timer checks.
  4608  	mb.clearCache()
  4609  }
  4610  
  4611  func (fs *fileStore) startAgeChk() {
  4612  	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
  4613  		fs.ageChk = time.AfterFunc(fs.cfg.MaxAge, fs.expireMsgs)
  4614  	}
  4615  }
  4616  
  4617  // Lock should be held.
  4618  func (fs *fileStore) resetAgeChk(delta int64) {
  4619  	if fs.cfg.MaxAge == 0 {
  4620  		return
  4621  	}
  4622  
  4623  	fireIn := fs.cfg.MaxAge
  4624  	if delta > 0 && time.Duration(delta) < fireIn {
  4625  		if fireIn = time.Duration(delta); fireIn < time.Second {
  4626  			// Only fire at most once a second.
  4627  			// Excessive firing can affect ingest performance.
  4628  			fireIn = time.Second
  4629  		}
  4630  	}
  4631  	if fs.ageChk != nil {
  4632  		fs.ageChk.Reset(fireIn)
  4633  	} else {
  4634  		fs.ageChk = time.AfterFunc(fireIn, fs.expireMsgs)
  4635  	}
  4636  }
  4637  
  4638  // Lock should be held.
  4639  func (fs *fileStore) cancelAgeChk() {
  4640  	if fs.ageChk != nil {
  4641  		fs.ageChk.Stop()
  4642  		fs.ageChk = nil
  4643  	}
  4644  }
  4645  
  4646  // Will expire msgs that are too old.
  4647  func (fs *fileStore) expireMsgs() {
  4648  	// We need to delete one by one here and cannot optimize for the time being.
  4649  	// The reason is that we need more information to adjust ack pending in consumers.
  4650  	var smv StoreMsg
  4651  	var sm *StoreMsg
  4652  	fs.mu.RLock()
  4653  	maxAge := int64(fs.cfg.MaxAge)
  4654  	minAge := time.Now().UnixNano() - maxAge
  4655  	fs.mu.RUnlock()
  4656  
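        	// Repeatedly look up the first message and remove it via limits while it is older than the minimum age.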
  4657  	for sm, _ = fs.msgForSeq(0, &smv); sm != nil && sm.ts <= minAge; sm, _ = fs.msgForSeq(0, &smv) {
  4658  		fs.mu.Lock()
  4659  		fs.removeMsgViaLimits(sm.seq)
  4660  		fs.mu.Unlock()
  4661  		// Recalculate in case we are expiring a bunch.
  4662  		minAge = time.Now().UnixNano() - maxAge
  4663  	}
  4664  
  4665  	fs.mu.Lock()
  4666  	defer fs.mu.Unlock()
  4667  
  4668  	// Only cancel if no messages are left, not on a potential lookup error that would result in sm == nil.
  4669  	if fs.state.Msgs == 0 {
  4670  		fs.cancelAgeChk()
  4671  	} else {
  4672  		if sm == nil {
  4673  			fs.resetAgeChk(0)
  4674  		} else {
  4675  			fs.resetAgeChk(sm.ts - minAge)
  4676  		}
  4677  	}
  4678  }
  4679  
  4680  // Lock should be held.
  4681  func (fs *fileStore) checkAndFlushAllBlocks() {
  4682  	for _, mb := range fs.blks {
  4683  		if mb.pendingWriteSize() > 0 {
  4684  			// Since fs lock is held need to pull this apart in case we need to rebuild state.
  4685  			mb.mu.Lock()
  4686  			ld, _ := mb.flushPendingMsgsLocked()
  4687  			mb.mu.Unlock()
  4688  			if ld != nil {
  4689  				fs.rebuildStateLocked(ld)
  4690  			}
  4691  		}
  4692  	}
  4693  }
  4694  
  4695  // This will check all the checksums on messages and report back any sequence numbers with errors.
  4696  func (fs *fileStore) checkMsgs() *LostStreamData {
  4697  	fs.mu.Lock()
  4698  	defer fs.mu.Unlock()
  4699  
  4700  	fs.checkAndFlushAllBlocks()
  4701  
  4702  	// Clear any global subject state.
  4703  	fs.psim, fs.tsl = fs.psim.Empty(), 0
  4704  
  4705  	for _, mb := range fs.blks {
  4706  		// Make sure encryption loaded if needed for the block.
  4707  		fs.loadEncryptionForMsgBlock(mb)
  4708  		// FIXME(dlc) - check tombstones here too?
  4709  		if ld, _, err := mb.rebuildState(); err != nil && ld != nil {
  4710  			// Rebuild fs state too.
  4711  			fs.rebuildStateLocked(ld)
  4712  		}
  4713  		fs.populateGlobalPerSubjectInfo(mb)
  4714  	}
  4715  
  4716  	return fs.ld
  4717  }
  4718  
  4719  // Lock should be held.
  4720  func (mb *msgBlock) enableForWriting(fip bool) error {
  4721  	if mb == nil {
  4722  		return errNoMsgBlk
  4723  	}
  4724  	if mb.mfd != nil {
  4725  		return nil
  4726  	}
  4727  	<-dios
  4728  	mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms)
  4729  	dios <- struct{}{}
  4730  	if err != nil {
  4731  		return fmt.Errorf("error opening msg block file [%q]: %v", mb.mfn, err)
  4732  	}
  4733  	mb.mfd = mfd
  4734  
  4735  	// Spin up our flusher loop if needed.
  4736  	if !fip {
  4737  		mb.spinUpFlushLoop()
  4738  	}
  4739  
  4740  	return nil
  4741  }
  4742  
  4743  // Helper function to place a delete tombstone.
  4744  func (mb *msgBlock) writeTombstone(seq uint64, ts int64) error {
  4745  	return mb.writeMsgRecord(emptyRecordLen, seq|tbit, _EMPTY_, nil, nil, ts, true)
  4746  }
  4747  
  4748  // Will write the message record to the underlying message block.
  4749  // filestore lock will be held.
  4750  func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush bool) error {
  4751  	mb.mu.Lock()
  4752  	defer mb.mu.Unlock()
  4753  
  4754  	// Enable for writing if our mfd is not open.
  4755  	if mb.mfd == nil {
  4756  		if err := mb.enableForWriting(flush); err != nil {
  4757  			return err
  4758  		}
  4759  	}
  4760  
  4761  	// Make sure we have a cache setup.
  4762  	if mb.cache == nil {
  4763  		mb.setupWriteCache(nil)
  4764  	}
  4765  
  4766  	// Check if we are tracking per subject for our simple state.
  4767  	// Do this before changing the cache, since that would trigger a flush pending msgs call
  4768  	// if we needed to regenerate the per subject info.
  4769  	// Note that tombstones have no subject so will not trigger here.
  4770  	if len(subj) > 0 && !mb.noTrack {
  4771  		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
  4772  			return err
  4773  		}
  4774  		if ss := mb.fss[subj]; ss != nil {
  4775  			ss.Msgs++
  4776  			ss.Last = seq
  4777  		} else {
  4778  			mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
  4779  		}
  4780  	}
  4781  
  4782  	// Indexing
  4783  	index := len(mb.cache.buf) + int(mb.cache.off)
  4784  
  4785  	// Formats
  4786  	// Format with no header
  4787  	// total_len(4) sequence(8) timestamp(8) subj_len(2) subj msg hash(8)
  4788  	// With headers, high bit on total length will be set.
  4789  	// total_len(4) sequence(8) timestamp(8) subj_len(2) subj hdr_len(4) hdr msg hash(8)
  4790  
  4791  	// First write header, etc.
  4792  	var le = binary.LittleEndian
  4793  	var hdr [msgHdrSize]byte
  4794  
  4795  	l := uint32(rl)
  4796  	hasHeaders := len(mhdr) > 0
  4797  	if hasHeaders {
  4798  		l |= hbit
  4799  	}
  4800  
  4801  	le.PutUint32(hdr[0:], l)
  4802  	le.PutUint64(hdr[4:], seq)
  4803  	le.PutUint64(hdr[12:], uint64(ts))
  4804  	le.PutUint16(hdr[20:], uint16(len(subj)))
  4805  
  4806  	// Now write to underlying buffer.
  4807  	mb.cache.buf = append(mb.cache.buf, hdr[:]...)
  4808  	mb.cache.buf = append(mb.cache.buf, subj...)
  4809  
  4810  	if hasHeaders {
  4811  		var hlen [4]byte
  4812  		le.PutUint32(hlen[0:], uint32(len(mhdr)))
  4813  		mb.cache.buf = append(mb.cache.buf, hlen[:]...)
  4814  		mb.cache.buf = append(mb.cache.buf, mhdr...)
  4815  	}
  4816  	mb.cache.buf = append(mb.cache.buf, msg...)
  4817  
  4818  	// Calculate hash.
  4819  	mb.hh.Reset()
  4820  	mb.hh.Write(hdr[4:20])
  4821  	mb.hh.Write([]byte(subj))
  4822  	if hasHeaders {
  4823  		mb.hh.Write(mhdr)
  4824  	}
  4825  	mb.hh.Write(msg)
  4826  	checksum := mb.hh.Sum(nil)
  4827  	// Grab last checksum
  4828  	copy(mb.lchk[0:], checksum)
  4829  
  4830  	// Update write through cache.
  4831  	// Write to msg record.
  4832  	mb.cache.buf = append(mb.cache.buf, checksum...)
  4833  	mb.cache.lrl = uint32(rl)
  4834  
  4835  	// Set cache timestamp for last store.
  4836  	mb.lwts = ts
  4837  
  4838  	// Only update index and do accounting if not a delete tombstone.
  4839  	if seq&tbit == 0 {
  4840  		// Accounting, do this before stripping ebit, it is ebit aware.
  4841  		mb.updateAccounting(seq, ts, rl)
  4842  		// Strip ebit if set.
  4843  		seq = seq &^ ebit
  4844  		if mb.cache.fseq == 0 {
  4845  			mb.cache.fseq = seq
  4846  		}
  4847  		// Write index
  4848  		mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit)
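        		// The hbit on this idx entry is decoded by slotInfo as its hashChecked flag, since we just computed this record's checksum.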
  4849  	}
  4850  
  4851  	fch, werr := mb.fch, mb.werr
  4852  
  4853  	// If we should be flushing, or had a write error, do so here.
  4854  	if flush || werr != nil {
  4855  		ld, err := mb.flushPendingMsgsLocked()
  4856  		if ld != nil && mb.fs != nil {
  4857  			// We have the mb lock here, this needs the mb locks so do in its own go routine.
  4858  			go mb.fs.rebuildState(ld)
  4859  		}
  4860  		if err != nil {
  4861  			return err
  4862  		}
  4863  	} else {
  4864  		// Kick the flusher here.
  4865  		kickFlusher(fch)
  4866  	}
  4867  
  4868  	return nil
  4869  }
  4870  
  4871  // How many bytes pending to be written for this message block.
  4872  func (mb *msgBlock) pendingWriteSize() int {
  4873  	if mb == nil {
  4874  		return 0
  4875  	}
  4876  	mb.mu.RLock()
  4877  	defer mb.mu.RUnlock()
  4878  	return mb.pendingWriteSizeLocked()
  4879  }
  4880  
  4881  // How many bytes pending to be written for this message block.
  4882  func (mb *msgBlock) pendingWriteSizeLocked() int {
  4883  	if mb == nil {
  4884  		return 0
  4885  	}
  4886  	var pending int
  4887  	if !mb.closed && mb.mfd != nil && mb.cache != nil {
  4888  		pending = len(mb.cache.buf) - int(mb.cache.wp)
  4889  	}
  4890  	return pending
  4891  }
  4892  
  4893  // Try to close our FDs if we can.
  4894  func (mb *msgBlock) closeFDs() error {
  4895  	mb.mu.Lock()
  4896  	defer mb.mu.Unlock()
  4897  	return mb.closeFDsLocked()
  4898  }
  4899  
  4900  func (mb *msgBlock) closeFDsLocked() error {
  4901  	if buf, _ := mb.bytesPending(); len(buf) > 0 {
  4902  		return errPendingData
  4903  	}
  4904  	mb.closeFDsLockedNoCheck()
  4905  	return nil
  4906  }
  4907  
  4908  func (mb *msgBlock) closeFDsLockedNoCheck() {
  4909  	if mb.mfd != nil {
  4910  		mb.mfd.Close()
  4911  		mb.mfd = nil
  4912  	}
  4913  }
  4914  
  4915  // bytesPending returns the buffer to be used for writing to the underlying file.
  4916  // This marks that we are in a flush and will return nil if asked again until cleared.
  4917  // Lock should be held.
  4918  func (mb *msgBlock) bytesPending() ([]byte, error) {
  4919  	if mb == nil || mb.mfd == nil {
  4920  		return nil, errNoPending
  4921  	}
  4922  	if mb.cache == nil {
  4923  		return nil, errNoCache
  4924  	}
  4925  	if len(mb.cache.buf) <= mb.cache.wp {
  4926  		return nil, errNoPending
  4927  	}
  4928  	buf := mb.cache.buf[mb.cache.wp:]
  4929  	if len(buf) == 0 {
  4930  		return nil, errNoPending
  4931  	}
  4932  	return buf, nil
  4933  }
  4934  
  4935  // Returns the current blkSize including deleted msgs etc.
  4936  func (mb *msgBlock) blkSize() uint64 {
  4937  	mb.mu.RLock()
  4938  	nb := mb.rbytes
  4939  	mb.mu.RUnlock()
  4940  	return nb
  4941  }
  4942  
  4943  // Update accounting on a write msg.
  4944  // Lock should be held.
  4945  func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) {
  4946  	isDeleted := seq&ebit != 0
  4947  	if isDeleted {
  4948  		seq = seq &^ ebit
  4949  	}
  4950  
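        	// If the block has no first seq/ts recorded yet, capture this record as the block's first.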
  4951  	fseq := atomic.LoadUint64(&mb.first.seq)
  4952  	if (fseq == 0 || mb.first.ts == 0) && seq >= fseq {
  4953  		atomic.StoreUint64(&mb.first.seq, seq)
  4954  		mb.first.ts = ts
  4955  	}
  4956  	// Need atomics here for selectMsgBlock speed.
  4957  	atomic.StoreUint64(&mb.last.seq, seq)
  4958  	mb.last.ts = ts
  4959  	mb.rbytes += rl
  4960  	if !isDeleted {
  4961  		mb.bytes += rl
  4962  		mb.msgs++
  4963  	}
  4964  }
  4965  
  4966  // Lock should be held.
  4967  func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg []byte) (uint64, error) {
  4968  	var err error
  4969  
  4970  	// Get size for this message.
  4971  	rl := fileStoreMsgSize(subj, hdr, msg)
  4972  	if rl&hbit != 0 {
  4973  		return 0, ErrMsgTooLarge
  4974  	}
  4975  	// Grab our current last message block.
  4976  	mb := fs.lmb
  4977  
  4978  	// Mark as dirty for stream state.
  4979  	fs.dirty++
  4980  
  4981  	if mb == nil || mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize {
  4982  		if mb != nil && fs.fcfg.Compression != NoCompression {
  4983  			// We've now reached the end of this message block, if we want
  4984  			// to compress blocks then now's the time to do it.
  4985  			go mb.recompressOnDiskIfNeeded()
  4986  		}
  4987  		if mb, err = fs.newMsgBlockForWrite(); err != nil {
  4988  			return 0, err
  4989  		}
  4990  	}
  4991  
  4992  	// Ask msg block to store in write through cache.
  4993  	err = mb.writeMsgRecord(rl, seq, subj, hdr, msg, ts, fs.fip)
  4994  
  4995  	return rl, err
  4996  }
  4997  
  4998  func (mb *msgBlock) recompressOnDiskIfNeeded() error {
  4999  	alg := mb.fs.fcfg.Compression
  5000  	mb.mu.Lock()
  5001  	defer mb.mu.Unlock()
  5002  
  5003  	origFN := mb.mfn                    // The original message block on disk.
  5004  	tmpFN := mb.mfn + compressTmpSuffix // The compressed block will be written here.
  5005  
  5006  	// Open up the file block and read in the entire contents into memory.
  5007  	// One of two things will happen:
  5008  	// 1. The block will be compressed already and have a valid metadata
  5009  	//    header, in which case we do nothing.
  5010  	// 2. The block will be uncompressed, in which case we will compress it
  5011  	//    and then write it back out to disk, reencrypting if necessary.
  5012  	<-dios
  5013  	origBuf, err := os.ReadFile(origFN)
  5014  	dios <- struct{}{}
  5015  
  5016  	if err != nil {
  5017  		return fmt.Errorf("failed to read original block from disk: %w", err)
  5018  	}
  5019  
  5020  	// If the block is encrypted then we will need to decrypt it before
  5021  	// doing anything. We always encrypt after compressing because then the
  5022  	// compression can be as efficient as possible on the raw data, whereas
  5023  	// the encrypted ciphertext will not compress anywhere near as well.
  5024  	// The block encryption also covers the optional compression metadata.
  5025  	if mb.bek != nil && len(origBuf) > 0 {
  5026  		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  5027  		if err != nil {
  5028  			return err
  5029  		}
  5030  		mb.bek = bek
  5031  		mb.bek.XORKeyStream(origBuf, origBuf)
  5032  	}
  5033  
  5034  	meta := &CompressionInfo{}
  5035  	if _, err := meta.UnmarshalMetadata(origBuf); err != nil {
  5036  		// An error is only returned here if there's a problem with parsing
  5037  		// the metadata. If the file has no metadata at all, no error is
  5038  		// returned and the algorithm defaults to no compression.
  5039  		return fmt.Errorf("failed to read existing metadata header: %w", err)
  5040  	}
  5041  	if meta.Algorithm == alg {
  5042  		// The block is already compressed with the chosen algorithm so there
  5043  		// is nothing else to do. This is not a common case; it is here only
  5044  		// to ensure we don't do unnecessary work in case something asked us
  5045  		// to recompress an already compressed block with the same algorithm.
  5046  		return nil
  5047  	} else if meta.Algorithm != NoCompression {
  5048  		// The block is already compressed using some algorithm, so we need
  5049  		// to decompress the block using the existing algorithm before we can
  5050  		// recompress it with the new one.
  5051  		if origBuf, err = meta.Algorithm.Decompress(origBuf); err != nil {
  5052  			return fmt.Errorf("failed to decompress original block: %w", err)
  5053  		}
  5054  	}
  5055  
  5056  	// Rather than modifying the existing block on disk (which is a dangerous
  5057  	// operation if something goes wrong), create a new temporary file. We will
  5058  	// write out the new block here and then swap the files around afterwards
  5059  	// once everything else has succeeded correctly.
  5060  	<-dios
  5061  	tmpFD, err := os.OpenFile(tmpFN, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, defaultFilePerms)
  5062  	dios <- struct{}{}
  5063  	if err != nil {
  5064  		return fmt.Errorf("failed to create temporary file: %w", err)
  5065  	}
  5066  
  5067  	// The original buffer at this point is uncompressed, so we will now compress
  5068  	// it if needed. Note that if the selected algorithm is NoCompression, the
  5069  	// Compress function will just return the input buffer unmodified.
  5070  	cmpBuf, err := alg.Compress(origBuf)
  5071  	if err != nil {
  5072  		return fmt.Errorf("failed to compress block: %w", err)
  5073  	}
  5074  
  5075  	// We only need to write out the metadata header if compression is enabled.
  5076  	// If we're trying to uncompress the file on disk at this point, don't bother
  5077  	// writing metadata.
  5078  	if alg != NoCompression {
  5079  		meta := &CompressionInfo{
  5080  			Algorithm:    alg,
  5081  			OriginalSize: uint64(len(origBuf)),
  5082  		}
  5083  		cmpBuf = append(meta.MarshalMetadata(), cmpBuf...)
  5084  	}
  5085  
  5086  	// Re-encrypt the block if necessary.
  5087  	if mb.bek != nil && len(cmpBuf) > 0 {
  5088  		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  5089  		if err != nil {
  5090  			return err
  5091  		}
  5092  		mb.bek = bek
  5093  		mb.bek.XORKeyStream(cmpBuf, cmpBuf)
  5094  	}
  5095  
  5096  	// Write the new block data (which might be compressed or encrypted) to the
  5097  	// temporary file.
  5098  	errorCleanup := func(err error) error {
  5099  		tmpFD.Close()
  5100  		os.Remove(tmpFN)
  5101  		return err
  5102  	}
  5103  	if n, err := tmpFD.Write(cmpBuf); err != nil {
  5104  		return errorCleanup(fmt.Errorf("failed to write to temporary file: %w", err))
  5105  	} else if n != len(cmpBuf) {
  5106  		return errorCleanup(fmt.Errorf("short write to temporary file (%d != %d)", n, len(cmpBuf)))
  5107  	}
  5108  	if err := tmpFD.Sync(); err != nil {
  5109  		return errorCleanup(fmt.Errorf("failed to sync temporary file: %w", err))
  5110  	}
  5111  	if err := tmpFD.Close(); err != nil {
  5112  		return errorCleanup(fmt.Errorf("failed to close temporary file: %w", err))
  5113  	}
  5114  
  5115  	// Now replace the original file with the newly updated temp file.
  5116  	if err := os.Rename(tmpFN, origFN); err != nil {
  5117  		return fmt.Errorf("failed to move temporary file into place: %w", err)
  5118  	}
  5119  
  5120  	// Since the message block might be retained in memory, make sure the
  5121  	// compression algorithm is up-to-date, as this will be needed when
  5122  	// compacting or truncating.
  5123  	mb.cmp = alg
  5124  	return nil
  5125  }
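
        // In short, the rewrite above never touches the live block until the very end:
        // the block is read whole, decrypted if a block encryption key is present,
        // decompressed with whatever algorithm its metadata names, recompressed (and
        // given a fresh metadata header) with the configured algorithm, re-encrypted,
        // written to a temporary file that is synced and closed, and only then renamed
        // over the original. A failure at any earlier step leaves the original block intact.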
  5126  
  5127  func (mb *msgBlock) decompressIfNeeded(buf []byte) ([]byte, error) {
  5128  	var meta CompressionInfo
  5129  	if n, err := meta.UnmarshalMetadata(buf); err != nil {
  5130  		// There was a problem parsing the metadata header of the block.
  5131  		// If there's no metadata header, an error isn't returned here,
  5132  		// we will instead just use default values of no compression.
  5133  		return nil, err
  5134  	} else if n == 0 {
  5135  		// There were no metadata bytes, so we assume the block is not
  5136  		// compressed and return it as-is.
  5137  		return buf, nil
  5138  	} else {
  5139  		// Metadata was present so it's quite likely the block contents
  5140  		// are compressed. If by any chance the metadata claims that the
  5141  		// block is uncompressed, then the input slice is just returned
  5142  		// unmodified.
  5143  		return meta.Algorithm.Decompress(buf[n:])
  5144  	}
  5145  }
  5146  
  5147  // Lock should be held.
  5148  func (mb *msgBlock) ensureRawBytesLoaded() error {
  5149  	if mb.rbytes > 0 {
  5150  		return nil
  5151  	}
  5152  	f, err := mb.openBlock()
  5153  	if err != nil {
  5154  		return err
  5155  	}
  5156  	defer f.Close()
  5157  	if fi, err := f.Stat(); fi != nil && err == nil {
  5158  		mb.rbytes = uint64(fi.Size())
  5159  	} else {
  5160  		return err
  5161  	}
  5162  	return nil
  5163  }
  5164  
  5165  // Sync msg and index files as needed. This is called from a timer.
  5166  func (fs *fileStore) syncBlocks() {
  5167  	fs.mu.RLock()
  5168  	if fs.closed {
  5169  		fs.mu.RUnlock()
  5170  		return
  5171  	}
  5172  	blks := append([]*msgBlock(nil), fs.blks...)
  5173  	lmb := fs.lmb
  5174  	syncInterval := fs.fcfg.SyncInterval
  5175  	fs.mu.RUnlock()
  5176  
  5177  	var markDirty bool
  5178  	for _, mb := range blks {
  5179  		// Do actual sync. Hold lock for consistency.
  5180  		mb.mu.Lock()
  5181  		if mb.closed {
  5182  			mb.mu.Unlock()
  5183  			continue
  5184  		}
  5185  		// See if we can close FDs due to being idle.
  5186  		if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle {
  5187  			mb.dirtyCloseWithRemove(false)
  5188  		}
  5189  		// Check our fss subject metadata.
  5190  		// If we have no activity within sync interval remove.
  5191  		if mb.fssLoaded() && mb.sinceLastActivity() > syncInterval {
  5192  			mb.fss = nil
  5193  		}
  5194  
  5195  		// Check if we should compact here as well.
  5196  		// Do not compact last mb.
  5197  		var needsCompact bool
  5198  		if mb != lmb && mb.ensureRawBytesLoaded() == nil && mb.rbytes > mb.bytes {
  5199  			needsCompact = true
  5200  			markDirty = true
  5201  		}
  5202  
  5203  		// Check if we need to sync. We will not hold lock during actual sync.
  5204  		needSync, fn := mb.needSync, mb.mfn
  5205  		if needSync {
  5206  			// Flush anything that may be pending.
  5207  			mb.flushPendingMsgsLocked()
  5208  		}
  5209  		mb.mu.Unlock()
  5210  
  5211  		// Check if we should compact here.
  5212  		// Need to hold fs lock in case we reference psim when loading in the mb.
  5213  		if needsCompact {
  5214  			fs.mu.RLock()
  5215  			mb.mu.Lock()
  5216  			mb.compact()
  5217  			mb.mu.Unlock()
  5218  			fs.mu.RUnlock()
  5219  		}
  5220  
  5221  		// Check if we need to sync.
  5222  		// This is done not holding any locks.
  5223  		if needSync {
  5224  			<-dios
  5225  			fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms)
  5226  			dios <- struct{}{}
  5227  			// If we have an fd.
  5228  			if fd != nil {
  5229  				canClear := fd.Sync() == nil
  5230  				fd.Close()
  5231  				// Only clear sync flag on success.
  5232  				if canClear {
  5233  					mb.mu.Lock()
  5234  					mb.needSync = false
  5235  					mb.mu.Unlock()
  5236  				}
  5237  			}
  5238  		}
  5239  	}
  5240  
  5241  	fs.mu.Lock()
  5242  	if fs.closed {
  5243  		fs.mu.Unlock()
  5244  		return
  5245  	}
  5246  	fs.setSyncTimer()
  5247  	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
  5248  	syncAlways := fs.fcfg.SyncAlways
  5249  	if markDirty {
  5250  		fs.dirty++
  5251  	}
  5252  	fs.mu.Unlock()
  5253  
  5254  	// Sync state file if we are not running with sync always.
  5255  	if !syncAlways {
  5256  		<-dios
  5257  		fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms)
  5258  		dios <- struct{}{}
  5259  		if fd != nil {
  5260  			fd.Sync()
  5261  			fd.Close()
  5262  		}
  5263  	}
  5264  }
  5265  
  5266  // Select the message block where this message should be found.
  5267  // Return nil if not in the set.
  5268  // Read lock should be held.
  5269  func (fs *fileStore) selectMsgBlock(seq uint64) *msgBlock {
  5270  	_, mb := fs.selectMsgBlockWithIndex(seq)
  5271  	return mb
  5272  }
  5273  
  5274  // Lock should be held.
  5275  func (fs *fileStore) selectMsgBlockWithIndex(seq uint64) (int, *msgBlock) {
  5276  	// Check for out of range.
  5277  	if seq < fs.state.FirstSeq || seq > fs.state.LastSeq {
  5278  		return -1, nil
  5279  	}
  5280  
  5281  	const linearThresh = 32
  5282  	nb := len(fs.blks) - 1
  5283  
  5284  	if nb < linearThresh {
  5285  		for i, mb := range fs.blks {
  5286  			if seq <= atomic.LoadUint64(&mb.last.seq) {
  5287  				return i, mb
  5288  			}
  5289  		}
  5290  		return -1, nil
  5291  	}
  5292  
  5293  	// Do traditional binary search here since we know the blocks are sorted by sequence first and last.
  5294  	for low, high, mid := 0, nb, nb/2; low <= high; mid = (low + high) / 2 {
  5295  		mb := fs.blks[mid]
  5296  		// Right now these atomic loads do not factor in, so fine to leave. Was considering
  5297  		// uplifting these to fs scope to avoid atomic load but not needed.
  5298  		first, last := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
  5299  		if seq > last {
  5300  			low = mid + 1
  5301  		} else if seq < first {
  5302  			// A message block's first sequence can change here, meaning we could find a gap.
  5303  			// We want to behave like the linear scan above, which (since the sequence is in
  5304  			// range, checked at the start) should always return an index and a valid mb.
  5305  			// If we have a gap then our seq would be > fs.blks[mid-1].last.seq
  5306  			if mid == 0 || seq > atomic.LoadUint64(&fs.blks[mid-1].last.seq) {
  5307  				return mid, mb
  5308  			}
  5309  			high = mid - 1
  5310  		} else {
  5311  			return mid, mb
  5312  		}
  5313  	}
  5314  
  5315  	return -1, nil
  5316  }
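
        // For example, with blocks covering sequences [1..100], [101..250] and [260..400],
        // a lookup for sequence 255 lands in the gap: the binary search narrows to the
        // third block, sees 255 < 260, confirms 255 > 250 (the previous block's last seq)
        // and returns the third block, which is exactly what the linear scan above would do.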
  5317  
  5318  // Select the message block where this message should be found.
  5319  // Return nil if not in the set.
  5320  func (fs *fileStore) selectMsgBlockForStart(minTime time.Time) *msgBlock {
  5321  	fs.mu.RLock()
  5322  	defer fs.mu.RUnlock()
  5323  
  5324  	t := minTime.UnixNano()
  5325  	for _, mb := range fs.blks {
  5326  		mb.mu.RLock()
  5327  		found := t <= mb.last.ts
  5328  		mb.mu.RUnlock()
  5329  		if found {
  5330  			return mb
  5331  		}
  5332  	}
  5333  	return nil
  5334  }
  5335  
  5336  // Index a raw msg buffer.
  5337  // Lock should be held.
  5338  func (mb *msgBlock) indexCacheBuf(buf []byte) error {
  5339  	var le = binary.LittleEndian
  5340  
  5341  	var fseq uint64
  5342  	var idx []uint32
  5343  	var index uint32
  5344  
  5345  	mbFirstSeq := atomic.LoadUint64(&mb.first.seq)
  5346  	mbLastSeq := atomic.LoadUint64(&mb.last.seq)
  5347  
  5348  	// Capture beginning size of dmap.
  5349  	dms := uint64(mb.dmap.Size())
  5350  	idxSz := mbLastSeq - mbFirstSeq + 1
  5351  
  5352  	if mb.cache == nil {
  5353  		// Approximation, may adjust below.
  5354  		fseq = mbFirstSeq
  5355  		idx = make([]uint32, 0, idxSz)
  5356  		mb.cache = &cache{}
  5357  	} else {
  5358  		fseq = mb.cache.fseq
  5359  		idx = mb.cache.idx
  5360  		if len(idx) == 0 {
  5361  			idx = make([]uint32, 0, idxSz)
  5362  		}
  5363  		index = uint32(len(mb.cache.buf))
  5364  		buf = append(mb.cache.buf, buf...)
  5365  	}
  5366  
  5367  	// Create FSS if we should track.
  5368  	var popFss bool
  5369  	if mb.fssNotLoaded() {
  5370  		mb.fss = make(map[string]*SimpleState)
  5371  		popFss = true
  5372  	}
  5373  
  5374  	lbuf := uint32(len(buf))
  5375  	var seq uint64
  5376  	for index < lbuf {
  5377  		if index+msgHdrSize > lbuf {
  5378  			return errCorruptState
  5379  		}
  5380  		hdr := buf[index : index+msgHdrSize]
  5381  		rl, slen := le.Uint32(hdr[0:]), int(le.Uint16(hdr[20:]))
  5382  		seq = le.Uint64(hdr[4:])
  5383  
  5384  		// Clear any headers bit that could be set.
  5385  		rl &^= hbit
  5386  		dlen := int(rl) - msgHdrSize
  5387  
  5388  		// Do some quick sanity checks here.
  5389  		if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh {
  5390  			// This means something is off.
  5391  			// TODO(dlc) - Add into bad list?
  5392  			return errCorruptState
  5393  		}
  5394  
  5395  		// Check for tombstones which we can skip in terms of indexing.
  5396  		if seq&tbit != 0 {
  5397  			index += rl
  5398  			continue
  5399  		}
  5400  
  5401  		// Clear any erase bits.
  5402  		erased := seq&ebit != 0
  5403  		seq = seq &^ ebit
  5404  
  5405  		// We defer checksum checks to individual msg cache lookups to amortize costs and
  5406  		// not introduce latency for the first message from a newly loaded block.
  5407  		if seq >= mbFirstSeq {
  5408  			// Track that we do not have holes.
  5409  			if slot := int(seq - mbFirstSeq); slot != len(idx) {
  5410  				// If we have a hole fill it.
  5411  				for dseq := mbFirstSeq + uint64(len(idx)); dseq < seq; dseq++ {
  5412  					idx = append(idx, dbit)
  5413  					if dms == 0 {
  5414  						mb.dmap.Insert(dseq)
  5415  					}
  5416  				}
  5417  			}
  5418  			// Add to our index.
  5419  			idx = append(idx, index)
  5420  			mb.cache.lrl = uint32(rl)
  5421  			// Adjust if we guessed wrong.
  5422  			if seq != 0 && seq < fseq {
  5423  				fseq = seq
  5424  			}
  5425  
  5426  			// Make sure our dmap has this entry if it was erased.
  5427  			if erased && dms == 0 {
  5428  				mb.dmap.Insert(seq)
  5429  			}
  5430  
  5431  			// Handle FSS inline here.
  5432  			if popFss && slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) {
  5433  				bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)]
  5434  				if ss := mb.fss[string(bsubj)]; ss != nil {
  5435  					ss.Msgs++
  5436  					ss.Last = seq
  5437  				} else {
  5438  					mb.fss[string(bsubj)] = &SimpleState{
  5439  						Msgs:  1,
  5440  						First: seq,
  5441  						Last:  seq,
  5442  					}
  5443  				}
  5444  			}
  5445  		}
  5446  		index += rl
  5447  	}
  5448  
  5449  	// Track holes at the end of the block; these would be missed in the
  5450  	// earlier loop if we've run out of block file to look at, but should
  5451  	// be easy to notice because the seq will be below the last seq from
  5452  	// the index.
  5453  	if seq > 0 && seq < mbLastSeq {
  5454  		for dseq := seq; dseq < mbLastSeq; dseq++ {
  5455  			idx = append(idx, dbit)
  5456  			if dms == 0 {
  5457  				mb.dmap.Insert(dseq)
  5458  			}
  5459  		}
  5460  	}
  5461  
  5462  	mb.cache.buf = buf
  5463  	mb.cache.idx = idx
  5464  	mb.cache.fseq = fseq
  5465  	mb.cache.wp += int(lbuf)
  5466  
  5467  	return nil
  5468  }
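
        // The resulting cache.idx has one slot per sequence: slot (seq - cache.fseq) holds
        // the record's offset, dbit marks slots for deleted or missing sequences, and hbit
        // is set later by cacheLookup once a record's checksum has been verified so the
        // check is not repeated.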
  5469  
  5470  // flushPendingMsgs writes out any messages for this message block.
  5471  func (mb *msgBlock) flushPendingMsgs() error {
  5472  	mb.mu.Lock()
  5473  	fsLostData, err := mb.flushPendingMsgsLocked()
  5474  	fs := mb.fs
  5475  	mb.mu.Unlock()
  5476  
  5477  	// Signals us that we need to rebuild filestore state.
  5478  	if fsLostData != nil && fs != nil {
  5479  		// Rebuild fs state too.
  5480  		fs.rebuildState(fsLostData)
  5481  	}
  5482  	return err
  5483  }
  5484  
  5485  // Write function for actual data.
  5486  // mb.mfd should not be nil.
  5487  // Lock should be held.
  5488  func (mb *msgBlock) writeAt(buf []byte, woff int64) (int, error) {
  5489  	// Used to mock write failures.
  5490  	if mb.mockWriteErr {
  5491  		// Reset on trip.
  5492  		mb.mockWriteErr = false
  5493  		return 0, errors.New("mock write error")
  5494  	}
  5495  	return mb.mfd.WriteAt(buf, woff)
  5496  }
  5497  
  5498  // flushPendingMsgsLocked writes out any messages for this message block.
  5499  // Lock should be held.
  5500  func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) {
  5501  	// Signals us that we need to rebuild filestore state.
  5502  	var fsLostData *LostStreamData
  5503  
  5504  	if mb.cache == nil || mb.mfd == nil {
  5505  		return nil, nil
  5506  	}
  5507  
  5508  	buf, err := mb.bytesPending()
  5509  	// If we got an error back return here.
  5510  	if err != nil {
  5511  		// No pending data to be written is not an error.
  5512  		if err == errNoPending || err == errNoCache {
  5513  			err = nil
  5514  		}
  5515  		return nil, err
  5516  	}
  5517  
  5518  	woff := int64(mb.cache.off + mb.cache.wp)
  5519  	lob := len(buf)
  5520  
  5521  	// TODO(dlc) - Normally we would not hold the lock across I/O so we can improve performance.
  5522  	// We will hold to stabilize the code base, as we have had a few anomalies with partial cache errors
  5523  	// under heavy load.
  5524  
  5525  	// Check if we need to encrypt.
  5526  	if mb.bek != nil && lob > 0 {
  5527  		// Need to leave original alone.
  5528  		var dst []byte
  5529  		if lob <= defaultLargeBlockSize {
  5530  			dst = getMsgBlockBuf(lob)[:lob]
  5531  		} else {
  5532  			dst = make([]byte, lob)
  5533  		}
  5534  		mb.bek.XORKeyStream(dst, buf)
  5535  		buf = dst
  5536  	}
  5537  
  5538  	// Append new data to the message block file.
  5539  	for lbb := lob; lbb > 0; lbb = len(buf) {
  5540  		n, err := mb.writeAt(buf, woff)
  5541  		if err != nil {
  5542  			mb.dirtyCloseWithRemove(false)
  5543  			ld, _, _ := mb.rebuildStateLocked()
  5544  			mb.werr = err
  5545  			return ld, err
  5546  		}
  5547  		// Update our write offset.
  5548  		woff += int64(n)
  5549  		// Partial write.
  5550  		if n != lbb {
  5551  			buf = buf[n:]
  5552  		} else {
  5553  			// Done.
  5554  			break
  5555  		}
  5556  	}
  5557  
  5558  	// Clear any error.
  5559  	mb.werr = nil
  5560  
  5561  	// Cache may be gone.
  5562  	if mb.cache == nil || mb.mfd == nil {
  5563  		return fsLostData, mb.werr
  5564  	}
  5565  
  5566  	// Check if we are in sync always mode.
  5567  	if mb.syncAlways {
  5568  		mb.mfd.Sync()
  5569  	} else {
  5570  		mb.needSync = true
  5571  	}
  5572  
  5573  	// Check for additional writes while we were writing to the disk.
  5574  	moreBytes := len(mb.cache.buf) - mb.cache.wp - lob
  5575  
  5576  	// Decide what we want to do with the buffer in hand. If we have load interest
  5577  	// we will hold onto the whole thing, otherwise empty the buffer, possibly reusing it.
  5578  	if ts := time.Now().UnixNano(); ts < mb.llts || (ts-mb.llts) <= int64(mb.cexp) {
  5579  		mb.cache.wp += lob
  5580  	} else {
  5581  		if cap(mb.cache.buf) <= maxBufReuse {
  5582  			buf = mb.cache.buf[:0]
  5583  		} else {
  5584  			recycleMsgBlockBuf(mb.cache.buf)
  5585  			buf = nil
  5586  		}
  5587  		if moreBytes > 0 {
  5588  			nbuf := mb.cache.buf[len(mb.cache.buf)-moreBytes:]
  5589  			if moreBytes > (len(mb.cache.buf)/4*3) && cap(nbuf) <= maxBufReuse {
  5590  				buf = nbuf
  5591  			} else {
  5592  				buf = append(buf, nbuf...)
  5593  			}
  5594  		}
  5595  		// Update our cache offset.
  5596  		mb.cache.off = int(woff)
  5597  		// Reset write pointer.
  5598  		mb.cache.wp = 0
  5599  		// Place buffer back in the cache structure.
  5600  		mb.cache.buf = buf
  5601  		// Mark fseq to 0
  5602  		mb.cache.fseq = 0
  5603  	}
  5604  
  5605  	return fsLostData, mb.werr
  5606  }
  5607  
  5608  // Lock should be held.
  5609  func (mb *msgBlock) clearLoading() {
  5610  	mb.loading = false
  5611  }
  5612  
  5613  // Will load msgs from disk.
  5614  func (mb *msgBlock) loadMsgs() error {
  5615  	// We hold the lock here the whole time by design.
  5616  	mb.mu.Lock()
  5617  	defer mb.mu.Unlock()
  5618  	return mb.loadMsgsWithLock()
  5619  }
  5620  
  5621  // Lock should be held.
  5622  func (mb *msgBlock) cacheAlreadyLoaded() bool {
  5623  	if mb.cache == nil || mb.cache.off != 0 || mb.cache.fseq == 0 || len(mb.cache.buf) == 0 {
  5624  		return false
  5625  	}
  5626  	numEntries := mb.msgs + uint64(mb.dmap.Size()) + (atomic.LoadUint64(&mb.first.seq) - mb.cache.fseq)
  5627  	return numEntries == uint64(len(mb.cache.idx))
  5628  }
  5629  
  5630  // Lock should be held.
  5631  func (mb *msgBlock) cacheNotLoaded() bool {
  5632  	return !mb.cacheAlreadyLoaded()
  5633  }
  5634  
  5635  // Report if our fss is not loaded.
  5636  // Lock should be held.
  5637  func (mb *msgBlock) fssNotLoaded() bool {
  5638  	return mb.fss == nil && !mb.noTrack
  5639  }
  5640  
  5641  // Report if we have our fss loaded.
  5642  // Lock should be held.
  5643  func (mb *msgBlock) fssLoaded() bool {
  5644  	return mb.fss != nil
  5645  }
  5646  
  5647  // Wrap openBlock for the gated semaphore processing.
  5648  // Lock should be held
  5649  func (mb *msgBlock) openBlock() (*os.File, error) {
  5650  	// Gate with concurrent IO semaphore.
  5651  	<-dios
  5652  	f, err := os.Open(mb.mfn)
  5653  	dios <- struct{}{}
  5654  	return f, err
  5655  }
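
        // The <-dios / dios <- struct{}{} pairs here and throughout this file form a
        // counting semaphore that bounds concurrent disk I/O. A minimal sketch of the
        // pattern (the limit of 4 is for illustration only; the real channel is sized
        // and pre-filled elsewhere in the package):
        //
        //	sem := make(chan struct{}, 4)
        //	for i := 0; i < cap(sem); i++ {
        //		sem <- struct{}{} // pre-fill with tokens
        //	}
        //	<-sem // take a token before touching disk
        //	f, err := os.Open(name)
        //	sem <- struct{}{} // return the token immediately after the call
        //	_, _ = f, err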
  5656  
  5657  // Used to load in the block contents.
  5658  // Lock should be held and all conditionals satisfied prior.
  5659  func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) {
  5660  	var f *os.File
  5661  	// Re-use if we have mfd open.
  5662  	if mb.mfd != nil {
  5663  		f = mb.mfd
  5664  		if n, err := f.Seek(0, 0); n != 0 || err != nil {
  5665  			f = nil
  5666  			mb.closeFDsLockedNoCheck()
  5667  		}
  5668  	}
  5669  	if f == nil {
  5670  		var err error
  5671  		f, err = mb.openBlock()
  5672  		if err != nil {
  5673  			if os.IsNotExist(err) {
  5674  				err = errNoBlkData
  5675  			}
  5676  			return nil, err
  5677  		}
  5678  		defer f.Close()
  5679  	}
  5680  
  5681  	var sz int
  5682  	if info, err := f.Stat(); err == nil {
  5683  		sz64 := info.Size()
  5684  		if int64(int(sz64)) == sz64 {
  5685  			sz = int(sz64)
  5686  		} else {
  5687  			return nil, errMsgBlkTooBig
  5688  		}
  5689  	}
  5690  
  5691  	if buf == nil {
  5692  		buf = getMsgBlockBuf(sz)
  5693  		if sz > cap(buf) {
  5694  			// We know we will make a new one so just recycle for now.
  5695  			recycleMsgBlockBuf(buf)
  5696  			buf = nil
  5697  		}
  5698  	}
  5699  
  5700  	if sz > cap(buf) {
  5701  		buf = make([]byte, sz)
  5702  	} else {
  5703  		buf = buf[:sz]
  5704  	}
  5705  
  5706  	<-dios
  5707  	n, err := io.ReadFull(f, buf)
  5708  	dios <- struct{}{}
  5709  	// On success capture raw bytes size.
  5710  	if err == nil {
  5711  		mb.rbytes = uint64(n)
  5712  	}
  5713  	return buf[:n], err
  5714  }
  5715  
  5716  // Lock should be held.
  5717  func (mb *msgBlock) loadMsgsWithLock() error {
  5718  	// Check for encryption; we do not load keys on startup anymore so we might need to load them here.
  5719  	if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) {
  5720  		if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil {
  5721  			return err
  5722  		}
  5723  	}
  5724  
  5725  	// Check to see if we are loading already.
  5726  	if mb.loading {
  5727  		return nil
  5728  	}
  5729  
  5730  	// Set loading status.
  5731  	mb.loading = true
  5732  	defer mb.clearLoading()
  5733  
  5734  	var nchecks int
  5735  
  5736  checkCache:
  5737  	nchecks++
  5738  	if nchecks > 8 {
  5739  		return errCorruptState
  5740  	}
  5741  
  5742  	// Check to see if we have a full cache.
  5743  	if mb.cacheAlreadyLoaded() {
  5744  		return nil
  5745  	}
  5746  
  5747  	mb.llts = time.Now().UnixNano()
  5748  
  5749  	// FIXME(dlc) - We could be smarter here.
  5750  	if buf, _ := mb.bytesPending(); len(buf) > 0 {
  5751  		ld, err := mb.flushPendingMsgsLocked()
  5752  		if ld != nil && mb.fs != nil {
  5753  			// We do not know if fs is locked or not at this point.
  5754  			// This should be an exceptional condition so do so in Go routine.
  5755  			go mb.fs.rebuildState(ld)
  5756  		}
  5757  		if err != nil {
  5758  			return err
  5759  		}
  5760  		goto checkCache
  5761  	}
  5762  
  5763  	// Load in the whole block.
  5764  	// We want to hold the mb lock here to avoid any changes to state.
  5765  	buf, err := mb.loadBlock(nil)
  5766  	if err != nil {
  5767  		if err == errNoBlkData {
  5768  			if ld, _, err := mb.rebuildStateLocked(); err != nil && ld != nil {
  5769  				// Rebuild fs state too.
  5770  				go mb.fs.rebuildState(ld)
  5771  			}
  5772  		}
  5773  		return err
  5774  	}
  5775  
  5776  	// Reset the cache since we just read everything in.
  5777  	// Make sure this is cleared in case we had a partial when we started.
  5778  	mb.clearCacheAndOffset()
  5779  
  5780  	// Check if we need to decrypt.
  5781  	if mb.bek != nil && len(buf) > 0 {
  5782  		bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
  5783  		if err != nil {
  5784  			return err
  5785  		}
  5786  		mb.bek = bek
  5787  		mb.bek.XORKeyStream(buf, buf)
  5788  	}
  5789  
  5790  	// Check for compression.
  5791  	if buf, err = mb.decompressIfNeeded(buf); err != nil {
  5792  		return err
  5793  	}
  5794  
  5795  	if err := mb.indexCacheBuf(buf); err != nil {
  5796  		if err == errCorruptState {
  5797  			var ld *LostStreamData
  5798  			if ld, _, err = mb.rebuildStateLocked(); ld != nil {
  5799  				// We do not know if fs is locked or not at this point.
  5800  				// This should be an exceptional condition so do so in Go routine.
  5801  				go mb.fs.rebuildState(ld)
  5802  			}
  5803  		}
  5804  		if err != nil {
  5805  			return err
  5806  		}
  5807  		goto checkCache
  5808  	}
  5809  
  5810  	if len(buf) > 0 {
  5811  		mb.cloads++
  5812  		mb.startCacheExpireTimer()
  5813  	}
  5814  
  5815  	return nil
  5816  }
  5817  
  5818  // Fetch a message from this block, possibly reading in and caching the messages.
  5819  // We assume the block was selected and is correct, so we do not do range checks.
  5820  func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) {
  5821  	mb.mu.Lock()
  5822  	defer mb.mu.Unlock()
  5823  
  5824  	fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
  5825  	if seq < fseq || seq > lseq {
  5826  		return nil, false, ErrStoreMsgNotFound
  5827  	}
  5828  
  5829  	// See if we can short circuit if we already know msg deleted.
  5830  	if mb.dmap.Exists(seq) {
  5831  		// Update for scanning like cacheLookup would have.
  5832  		llseq := mb.llseq
  5833  		if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 || seq == mb.llseq-1 {
  5834  			mb.llseq = seq
  5835  		}
  5836  		expireOk := (seq == lseq && llseq == seq-1) || (seq == fseq && llseq == seq+1)
  5837  		return nil, expireOk, errDeletedMsg
  5838  	}
  5839  
  5840  	if mb.cacheNotLoaded() {
  5841  		if err := mb.loadMsgsWithLock(); err != nil {
  5842  			return nil, false, err
  5843  		}
  5844  	}
  5845  	llseq := mb.llseq
  5846  
  5847  	fsm, err := mb.cacheLookup(seq, sm)
  5848  	if err != nil {
  5849  		return nil, false, err
  5850  	}
  5851  	expireOk := (seq == lseq && llseq == seq-1) || (seq == fseq && llseq == seq+1)
  5852  	return fsm, expireOk, err
  5853  }
  5854  
  5855  var (
  5856  	errNoCache       = errors.New("no message cache")
  5857  	errBadMsg        = errors.New("malformed or corrupt message")
  5858  	errDeletedMsg    = errors.New("deleted message")
  5859  	errPartialCache  = errors.New("partial cache")
  5860  	errNoPending     = errors.New("message block does not have pending data")
  5861  	errNotReadable   = errors.New("storage directory not readable")
  5862  	errCorruptState  = errors.New("corrupt state file")
  5863  	errPriorState    = errors.New("prior state file")
  5864  	errPendingData   = errors.New("pending data still present")
  5865  	errNoEncryption  = errors.New("encryption not enabled")
  5866  	errBadKeySize    = errors.New("encryption bad key size")
  5867  	errNoMsgBlk      = errors.New("no message block")
  5868  	errMsgBlkTooBig  = errors.New("message block size exceeded int capacity")
  5869  	errUnknownCipher = errors.New("unknown cipher")
  5870  	errNoMainKey     = errors.New("encrypted store encountered with no main key")
  5871  	errNoBlkData     = errors.New("message block data missing")
  5872  )
  5873  
  5874  const (
  5875  	// Used in the cache index to mark messages that have had their checksums checked,
  5876  	// and in the record length to signal a message record with headers.
  5877  	hbit = 1 << 31
  5878  	// Used for marking erased messages sequences.
  5879  	ebit = 1 << 63
  5880  	// Used for marking tombstone sequences.
  5881  	tbit = 1 << 62
  5882  	// Used to mark an index as deleted and non-existent.
  5883  	dbit = 1 << 30
  5884  )
  5885  
  5886  // Will do a lookup from cache.
  5887  // Lock should be held.
  5888  func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
  5889  	if seq < atomic.LoadUint64(&mb.first.seq) || seq > atomic.LoadUint64(&mb.last.seq) {
  5890  		return nil, ErrStoreMsgNotFound
  5891  	}
  5892  
  5893  	// The llseq signals us when we can expire a cache at the end of a linear scan.
  5894  	// We want to only update when we know the last reads (multiple consumers) are sequential.
  5895  	// We want to account for forwards and backwards linear scans.
  5896  	if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 || seq == mb.llseq-1 {
  5897  		mb.llseq = seq
  5898  	}
  5899  
  5900  	// If we have a delete map check it.
  5901  	if mb.dmap.Exists(seq) {
  5902  		mb.llts = time.Now().UnixNano()
  5903  		return nil, errDeletedMsg
  5904  	}
  5905  
  5906  	// Detect no cache loaded.
  5907  	if mb.cache == nil || mb.cache.fseq == 0 || len(mb.cache.idx) == 0 || len(mb.cache.buf) == 0 {
  5908  		return nil, errNoCache
  5909  	}
  5910  	// Check partial cache status.
  5911  	if seq < mb.cache.fseq {
  5912  		return nil, errPartialCache
  5913  	}
  5914  
  5915  	bi, _, hashChecked, err := mb.slotInfo(int(seq - mb.cache.fseq))
  5916  	if err != nil {
  5917  		return nil, err
  5918  	}
  5919  
  5920  	// Update cache activity.
  5921  	mb.llts = time.Now().UnixNano()
  5922  
  5923  	li := int(bi) - mb.cache.off
  5924  	if li >= len(mb.cache.buf) {
  5925  		return nil, errPartialCache
  5926  	}
  5927  	buf := mb.cache.buf[li:]
  5928  
  5929  	// We use the high bit to denote we have already checked the checksum.
  5930  	var hh hash.Hash64
  5931  	if !hashChecked {
  5932  		hh = mb.hh // This will force the hash check in msgFromBuf.
  5933  	}
  5934  
  5935  	// Parse from the raw buffer.
  5936  	fsm, err := mb.msgFromBuf(buf, sm, hh)
  5937  	if err != nil || fsm == nil {
  5938  		return nil, err
  5939  	}
  5940  
  5941  	// Deleted messages that are decoded return a 0 for sequence.
  5942  	if fsm.seq == 0 {
  5943  		return nil, errDeletedMsg
  5944  	}
  5945  
  5946  	if seq != fsm.seq {
  5947  		recycleMsgBlockBuf(mb.cache.buf)
  5948  		mb.cache.buf = nil
  5949  		return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, fsm.seq)
  5950  	}
  5951  
  5952  	// Set the check bit here after we know all is good, so future lookups skip the hash check.
  5953  	if !hashChecked {
  5954  		mb.cache.idx[seq-mb.cache.fseq] = (bi | hbit)
  5955  	}
  5956  
  5957  	return fsm, nil
  5958  }
  5959  
  5960  // Used when we are checking if discarding a message due to max msgs per subject will give us
  5961  // enough room for a max bytes condition.
  5962  // Lock should be already held.
  5963  func (fs *fileStore) sizeForSeq(seq uint64) int {
  5964  	if seq == 0 {
  5965  		return 0
  5966  	}
  5967  	var smv StoreMsg
  5968  	if mb := fs.selectMsgBlock(seq); mb != nil {
  5969  		if sm, _, _ := mb.fetchMsg(seq, &smv); sm != nil {
  5970  			return int(fileStoreMsgSize(sm.subj, sm.hdr, sm.msg))
  5971  		}
  5972  	}
  5973  	return 0
  5974  }
  5975  
  5976  // Will return message for the given sequence number.
  5977  func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
  5978  	// TODO(dlc) - Since Store, Remove, Skip all hold the write lock on fs this will
  5979  	// be stalled. Need another lock if we want this to happen in parallel.
  5980  	fs.mu.RLock()
  5981  	if fs.closed {
  5982  		fs.mu.RUnlock()
  5983  		return nil, ErrStoreClosed
  5984  	}
  5985  	// Indicates we want first msg.
  5986  	if seq == 0 {
  5987  		seq = fs.state.FirstSeq
  5988  	}
  5989  	// Make sure to snapshot here.
  5990  	mb, lseq := fs.selectMsgBlock(seq), fs.state.LastSeq
  5991  	fs.mu.RUnlock()
  5992  
  5993  	if mb == nil {
  5994  		var err = ErrStoreEOF
  5995  		if seq <= lseq {
  5996  			err = ErrStoreMsgNotFound
  5997  		}
  5998  		return nil, err
  5999  	}
  6000  
  6001  	fsm, expireOk, err := mb.fetchMsg(seq, sm)
  6002  	if err != nil {
  6003  		return nil, err
  6004  	}
  6005  
  6006  	// We detected a linear scan and access to the last message.
  6007  	// If we are not the last message block we can try to expire the cache.
  6008  	if expireOk {
  6009  		mb.tryForceExpireCache()
  6010  	}
  6011  
  6012  	return fsm, nil
  6013  }
  6014  
  6015  // Internal function to return msg parts from a raw buffer.
  6016  // Lock should be held.
  6017  func (mb *msgBlock) msgFromBuf(buf []byte, sm *StoreMsg, hh hash.Hash64) (*StoreMsg, error) {
  6018  	if len(buf) < emptyRecordLen {
  6019  		return nil, errBadMsg
  6020  	}
  6021  	var le = binary.LittleEndian
  6022  
  6023  	hdr := buf[:msgHdrSize]
  6024  	rl := le.Uint32(hdr[0:])
  6025  	hasHeaders := rl&hbit != 0
  6026  	rl &^= hbit // clear header bit
  6027  	dlen := int(rl) - msgHdrSize
  6028  	slen := int(le.Uint16(hdr[20:]))
  6029  	// Simple sanity check.
  6030  	if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || int(rl) > len(buf) {
  6031  		return nil, errBadMsg
  6032  	}
  6033  	data := buf[msgHdrSize : msgHdrSize+dlen]
  6034  	// Do checksum tests here if requested.
  6035  	if hh != nil {
  6036  		hh.Reset()
  6037  		hh.Write(hdr[4:20])
  6038  		hh.Write(data[:slen])
  6039  		if hasHeaders {
  6040  			hh.Write(data[slen+4 : dlen-recordHashSize])
  6041  		} else {
  6042  			hh.Write(data[slen : dlen-recordHashSize])
  6043  		}
  6044  		if !bytes.Equal(hh.Sum(nil), data[len(data)-8:]) {
  6045  			return nil, errBadMsg
  6046  		}
  6047  	}
  6048  	seq := le.Uint64(hdr[4:])
  6049  	if seq&ebit != 0 {
  6050  		seq = 0
  6051  	}
  6052  	ts := int64(le.Uint64(hdr[12:]))
  6053  
  6054  	// Create a StoreMsg if needed.
  6055  	if sm == nil {
  6056  		sm = new(StoreMsg)
  6057  	} else {
  6058  		sm.clear()
  6059  	}
  6060  	// To recycle the large blocks we can never pass back a reference, so we need to copy for the
  6061  	// upper layers; that keeps us safe to expire, and recycle, the large msgBlocks.
  6062  	end := dlen - 8
  6063  
  6064  	if hasHeaders {
  6065  		hl := le.Uint32(data[slen:])
  6066  		bi := slen + 4
  6067  		li := bi + int(hl)
  6068  		sm.buf = append(sm.buf, data[bi:end]...)
  6069  		li, end = li-bi, end-bi
  6070  		sm.hdr = sm.buf[0:li:li]
  6071  		sm.msg = sm.buf[li:end]
  6072  	} else {
  6073  		sm.buf = append(sm.buf, data[slen:end]...)
  6074  		sm.msg = sm.buf[0 : end-slen]
  6075  	}
  6076  	sm.seq, sm.ts = seq, ts
  6077  	if slen > 0 {
  6078  		// Make a copy since sm.subj lifetime may last longer.
  6079  		sm.subj = string(data[:slen])
  6080  	}
  6081  
  6082  	return sm, nil
  6083  }
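
        // The on-disk record layout parsed above is:
        //
        //	rl(4) | seq(8) | ts(8) | subj_len(2) | subject | [hdr_len(4) | headers] | payload | hash(8)
        //
        // where the high bit of rl (hbit) signals that the optional headers section is
        // present and the high bit of seq (ebit) marks an erased message.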
  6084  
  6085  // LoadMsg will lookup the message by sequence number and return it if found.
  6086  func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) {
  6087  	return fs.msgForSeq(seq, sm)
  6088  }
  6089  
  6090  // loadLast will load the last message for a subject. The subject should be non-empty and not ">".
  6091  func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err error) {
  6092  	fs.mu.RLock()
  6093  	defer fs.mu.RUnlock()
  6094  
  6095  	if fs.closed || fs.lmb == nil {
  6096  		return nil, ErrStoreClosed
  6097  	}
  6098  
  6099  	if len(fs.blks) == 0 {
  6100  		return nil, ErrStoreMsgNotFound
  6101  	}
  6102  
  6103  	start, stop := fs.lmb.index, fs.blks[0].index
  6104  	wc := subjectHasWildcard(subj)
  6105  	// If literal subject check for presence.
  6106  	if !wc {
  6107  		if info, ok := fs.psim.Find(stringToBytes(subj)); !ok {
  6108  			return nil, ErrStoreMsgNotFound
  6109  		} else {
  6110  			start, stop = info.lblk, info.fblk
  6111  		}
  6112  	}
  6113  
  6114  	// Walk blocks backwards.
  6115  	for i := start; i >= stop; i-- {
  6116  		mb := fs.bim[i]
  6117  		if mb == nil {
  6118  			continue
  6119  		}
  6120  		mb.mu.Lock()
  6121  		if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
  6122  			mb.mu.Unlock()
  6123  			return nil, err
  6124  		}
  6125  		var l uint64
  6126  		// Optimize if subject is not a wildcard.
  6127  		if !wc {
  6128  			if ss := mb.fss[subj]; ss != nil {
  6129  				l = ss.Last
  6130  			}
  6131  		}
  6132  		if l == 0 {
  6133  			_, _, l = mb.filteredPendingLocked(subj, wc, atomic.LoadUint64(&mb.first.seq))
  6134  		}
  6135  		if l > 0 {
  6136  			if mb.cacheNotLoaded() {
  6137  				if err := mb.loadMsgsWithLock(); err != nil {
  6138  					mb.mu.Unlock()
  6139  					return nil, err
  6140  				}
  6141  			}
  6142  			lsm, err = mb.cacheLookup(l, sm)
  6143  		}
  6144  		mb.mu.Unlock()
  6145  		if l > 0 {
  6146  			break
  6147  		}
  6148  	}
  6149  	return lsm, err
  6150  }
  6151  
  6152  // LoadLastMsg will return the last message we have that matches a given subject.
  6153  // The subject can be a wildcard.
  6154  func (fs *fileStore) LoadLastMsg(subject string, smv *StoreMsg) (sm *StoreMsg, err error) {
  6155  	if subject == _EMPTY_ || subject == fwcs {
  6156  		sm, err = fs.msgForSeq(fs.lastSeq(), smv)
  6157  	} else {
  6158  		sm, err = fs.loadLast(subject, smv)
  6159  	}
  6160  	if sm == nil || (err != nil && err != ErrStoreClosed) {
  6161  		err = ErrStoreMsgNotFound
  6162  	}
  6163  	return sm, err
  6164  }
  6165  
  6166  func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, uint64, error) {
  6167  	fs.mu.RLock()
  6168  	defer fs.mu.RUnlock()
  6169  
  6170  	if fs.closed {
  6171  		return nil, 0, ErrStoreClosed
  6172  	}
  6173  	if start < fs.state.FirstSeq {
  6174  		start = fs.state.FirstSeq
  6175  	}
  6176  
  6177  	// If start is less than or equal to the beginning of our stream, meaning this is our first call,
  6178  	// check the psim to see if we can skip ahead.
  6179  	if start <= fs.state.FirstSeq {
  6180  		var ss SimpleState
  6181  		fs.numFilteredPending(filter, &ss)
  6182  		if ss.First > start {
  6183  			start = ss.First
  6184  		}
  6185  	}
  6186  
  6187  	if bi, _ := fs.selectMsgBlockWithIndex(start); bi >= 0 {
  6188  		for i := bi; i < len(fs.blks); i++ {
  6189  			mb := fs.blks[i]
  6190  			if sm, expireOk, err := mb.firstMatching(filter, wc, start, sm); err == nil {
  6191  				if expireOk {
  6192  					mb.tryForceExpireCache()
  6193  				}
  6194  				return sm, sm.seq, nil
  6195  			} else if err != ErrStoreMsgNotFound {
  6196  				return nil, 0, err
  6197  			} else if expireOk {
  6198  				mb.tryForceExpireCache()
  6199  			}
  6200  		}
  6201  	}
  6202  
  6203  	return nil, fs.state.LastSeq, ErrStoreEOF
  6204  }
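
        // Illustrative use of LoadNextMsg (a sketch only; the filter subject "orders.*"
        // is just an example): walk a stream for a filter, advancing past each returned
        // sequence until the store reports ErrStoreEOF.
        //
        //	var smv StoreMsg
        //	seq := uint64(1)
        //	for {
        //		sm, nseq, err := fs.LoadNextMsg("orders.*", true, seq, &smv)
        //		if err != nil {
        //			break // ErrStoreEOF once we are past the last match.
        //		}
        //		_ = sm // process the matched message
        //		seq = nseq + 1
        //	}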
  6205  
  6206  // Type returns the type of the underlying store.
  6207  func (fs *fileStore) Type() StorageType {
  6208  	return FileStorage
  6209  }
  6210  
  6211  // Returns number of subjects in this store.
  6212  // Lock should be held.
  6213  func (fs *fileStore) numSubjects() int {
  6214  	return fs.psim.Size()
  6215  }
  6216  
  6217  // numConsumers uses new lock.
  6218  func (fs *fileStore) numConsumers() int {
  6219  	fs.cmu.RLock()
  6220  	defer fs.cmu.RUnlock()
  6221  	return len(fs.cfs)
  6222  }
  6223  
  6224  // FastState will fill in state with only the following:
  6225  // Msgs, Bytes, First and Last Sequence and Time, and NumDeleted.
  6226  func (fs *fileStore) FastState(state *StreamState) {
  6227  	fs.mu.RLock()
  6228  	state.Msgs = fs.state.Msgs
  6229  	state.Bytes = fs.state.Bytes
  6230  	state.FirstSeq = fs.state.FirstSeq
  6231  	state.FirstTime = fs.state.FirstTime
  6232  	state.LastSeq = fs.state.LastSeq
  6233  	state.LastTime = fs.state.LastTime
  6234  	if state.LastSeq > state.FirstSeq {
  6235  		state.NumDeleted = int((state.LastSeq - state.FirstSeq + 1) - state.Msgs)
  6236  		if state.NumDeleted < 0 {
  6237  			state.NumDeleted = 0
  6238  		}
  6239  	}
  6240  	state.Consumers = fs.numConsumers()
  6241  	state.NumSubjects = fs.numSubjects()
  6242  	fs.mu.RUnlock()
  6243  }
  6244  
  6245  // State returns the current state of the stream.
  6246  func (fs *fileStore) State() StreamState {
  6247  	fs.mu.RLock()
  6248  	state := fs.state
  6249  	state.Consumers = fs.numConsumers()
  6250  	state.NumSubjects = fs.numSubjects()
  6251  	state.Deleted = nil // make sure.
  6252  
  6253  	if numDeleted := int((state.LastSeq - state.FirstSeq + 1) - state.Msgs); numDeleted > 0 {
  6254  		state.Deleted = make([]uint64, 0, numDeleted)
  6255  		cur := fs.state.FirstSeq
  6256  
  6257  		for _, mb := range fs.blks {
  6258  			mb.mu.Lock()
  6259  			fseq := atomic.LoadUint64(&mb.first.seq)
  6260  			// Account for messages missing from the head.
  6261  			if fseq > cur {
  6262  				for seq := cur; seq < fseq; seq++ {
  6263  					state.Deleted = append(state.Deleted, seq)
  6264  				}
  6265  			}
  6266  			cur = atomic.LoadUint64(&mb.last.seq) + 1 // Expected next first.
  6267  
  6268  			mb.dmap.Range(func(seq uint64) bool {
  6269  				if seq < fseq {
  6270  					mb.dmap.Delete(seq)
  6271  				} else {
  6272  					state.Deleted = append(state.Deleted, seq)
  6273  				}
  6274  				return true
  6275  			})
  6276  			mb.mu.Unlock()
  6277  		}
  6278  	}
  6279  	fs.mu.RUnlock()
  6280  
  6281  	state.Lost = fs.lostData()
  6282  
  6283  	// Cannot be guaranteed to be sorted.
  6284  	if len(state.Deleted) > 0 {
  6285  		sort.Slice(state.Deleted, func(i, j int) bool {
  6286  			return state.Deleted[i] < state.Deleted[j]
  6287  		})
  6288  		state.NumDeleted = len(state.Deleted)
  6289  	}
  6290  	return state
  6291  }
  6292  
  6293  func (fs *fileStore) Utilization() (total, reported uint64, err error) {
  6294  	fs.mu.RLock()
  6295  	defer fs.mu.RUnlock()
  6296  	for _, mb := range fs.blks {
  6297  		mb.mu.RLock()
  6298  		reported += mb.bytes
  6299  		total += mb.rbytes
  6300  		mb.mu.RUnlock()
  6301  	}
  6302  	return total, reported, nil
  6303  }
  6304  
  6305  func fileStoreMsgSize(subj string, hdr, msg []byte) uint64 {
  6306  	if len(hdr) == 0 {
  6307  		// length of the message record (4 bytes) + seq(8) + ts(8) + subj_len(2) + subj + msg + hash(8)
  6308  		return uint64(22 + len(subj) + len(msg) + 8)
  6309  	}
  6310  	// length of the message record (4 bytes) + seq(8) + ts(8) + subj_len(2) + subj + hdr_len(4) + hdr + msg + hash(8)
  6311  	return uint64(22 + len(subj) + 4 + len(hdr) + len(msg) + 8)
  6312  }
  6313  
  6314  func fileStoreMsgSizeEstimate(slen, maxPayload int) uint64 {
  6315  	return uint64(emptyRecordLen + slen + 4 + maxPayload)
  6316  }
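
        // For example, a record for subject "orders.new" (10 bytes) with a 3-byte payload
        // and no headers takes 22 + 10 + 3 + 8 = 43 bytes on disk: the fixed 22-byte header
        // (rl, seq, ts, subj_len), the subject, the payload and the trailing 8-byte hash.
        // With a 5-byte header block it becomes 22 + 10 + 4 + 5 + 3 + 8 = 52 bytes, the
        // extra 4 bytes holding hdr_len.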
  6317  
  6318  // Determine time since any last activity, read/load, write or remove.
  6319  func (mb *msgBlock) sinceLastActivity() time.Duration {
  6320  	if mb.closed {
  6321  		return 0
  6322  	}
  6323  	last := mb.lwts
  6324  	if mb.lrts > last {
  6325  		last = mb.lrts
  6326  	}
  6327  	if mb.llts > last {
  6328  		last = mb.llts
  6329  	}
  6330  	return time.Since(time.Unix(0, last).UTC())
  6331  }
  6332  
  6333  // Determine time since last write or remove of a message.
  6334  // Read lock should be held.
  6335  func (mb *msgBlock) sinceLastWriteActivity() time.Duration {
  6336  	if mb.closed {
  6337  		return 0
  6338  	}
  6339  	last := mb.lwts
  6340  	if mb.lrts > last {
  6341  		last = mb.lrts
  6342  	}
  6343  	return time.Since(time.Unix(0, last).UTC())
  6344  }
  6345  
  6346  func checkNewHeader(hdr []byte) error {
  6347  	if hdr == nil || len(hdr) < 2 || hdr[0] != magic ||
  6348  		(hdr[1] != version && hdr[1] != newVersion) {
  6349  		return errCorruptState
  6350  	}
  6351  	return nil
  6352  }
  6353  
  6354  // readIndexInfo will read in the index information for the message block.
  6355  func (mb *msgBlock) readIndexInfo() error {
  6356  	ifn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, mb.index))
  6357  	buf, err := os.ReadFile(ifn)
  6358  	if err != nil {
  6359  		return err
  6360  	}
  6361  
  6362  	// Set if first time.
  6363  	if mb.liwsz == 0 {
  6364  		mb.liwsz = int64(len(buf))
  6365  	}
  6366  
  6367  	// Decrypt if needed.
  6368  	if mb.aek != nil {
  6369  		buf, err = mb.aek.Open(buf[:0], mb.nonce, buf, nil)
  6370  		if err != nil {
  6371  			return err
  6372  		}
  6373  	}
  6374  
  6375  	if err := checkNewHeader(buf); err != nil {
  6376  		defer os.Remove(ifn)
  6377  		return fmt.Errorf("bad index file")
  6378  	}
  6379  
  6380  	bi := hdrLen
  6381  
  6382  	// Helpers, will set bi to -1 on error.
  6383  	readSeq := func() uint64 {
  6384  		if bi < 0 {
  6385  			return 0
  6386  		}
  6387  		seq, n := binary.Uvarint(buf[bi:])
  6388  		if n <= 0 {
  6389  			bi = -1
  6390  			return 0
  6391  		}
  6392  		bi += n
  6393  		return seq &^ ebit
  6394  	}
  6395  	readCount := readSeq
  6396  	readTimeStamp := func() int64 {
  6397  		if bi < 0 {
  6398  			return 0
  6399  		}
  6400  		ts, n := binary.Varint(buf[bi:])
  6401  		if n <= 0 {
  6402  			bi = -1
  6403  			return -1
  6404  		}
  6405  		bi += n
  6406  		return ts
  6407  	}
  6408  	mb.msgs = readCount()
  6409  	mb.bytes = readCount()
  6410  	atomic.StoreUint64(&mb.first.seq, readSeq())
  6411  	mb.first.ts = readTimeStamp()
  6412  	atomic.StoreUint64(&mb.last.seq, readSeq())
  6413  	mb.last.ts = readTimeStamp()
  6414  	dmapLen := readCount()
  6415  
  6416  	// Check if this is a short write index file.
  6417  	if bi < 0 || bi+checksumSize > len(buf) {
  6418  		os.Remove(ifn)
  6419  		return fmt.Errorf("short index file")
  6420  	}
  6421  
  6422  	// Check for consistency of accounting. If something is off, bail and we will rebuild.
  6423  	if mb.msgs != (atomic.LoadUint64(&mb.last.seq)-atomic.LoadUint64(&mb.first.seq)+1)-dmapLen {
  6424  		os.Remove(ifn)
  6425  		return fmt.Errorf("accounting inconsistent")
  6426  	}
  6427  
  6428  	// Checksum
  6429  	copy(mb.lchk[0:], buf[bi:bi+checksumSize])
  6430  	bi += checksumSize
  6431  
  6432  	// Now check for presence of a delete map
  6433  	if dmapLen > 0 {
  6434  		// New version is encoded avl seqset.
  6435  		if buf[1] == newVersion {
  6436  			dmap, _, err := avl.Decode(buf[bi:])
  6437  			if err != nil {
  6438  				return fmt.Errorf("could not decode avl dmap: %v", err)
  6439  			}
  6440  			mb.dmap = *dmap
  6441  		} else {
  6442  			// This is the old version.
  6443  			for i, fseq := 0, atomic.LoadUint64(&mb.first.seq); i < int(dmapLen); i++ {
  6444  				seq := readSeq()
  6445  				if seq == 0 {
  6446  					break
  6447  				}
  6448  				mb.dmap.Insert(seq + fseq)
  6449  			}
  6450  		}
  6451  	}
  6452  
  6453  	return nil
  6454  }
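
        // The legacy index file read above is laid out as the magic/version header,
        // then varint-encoded msgs, bytes, first seq, first ts, last seq, last ts and
        // dmap length, followed by the block checksum and, if present, the delete map
        // (an avl seqset in the new version, sequences delta-encoded against the first
        // seq in the old one).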
  6455  
  6456  // Will return total number of cache loads.
  6457  func (fs *fileStore) cacheLoads() uint64 {
  6458  	var tl uint64
  6459  	fs.mu.RLock()
  6460  	for _, mb := range fs.blks {
  6461  		tl += mb.cloads
  6462  	}
  6463  	fs.mu.RUnlock()
  6464  	return tl
  6465  }
  6466  
  6467  // Will return total number of cached bytes.
  6468  func (fs *fileStore) cacheSize() uint64 {
  6469  	var sz uint64
  6470  	fs.mu.RLock()
  6471  	for _, mb := range fs.blks {
  6472  		mb.mu.RLock()
  6473  		if mb.cache != nil {
  6474  			sz += uint64(len(mb.cache.buf))
  6475  		}
  6476  		mb.mu.RUnlock()
  6477  	}
  6478  	fs.mu.RUnlock()
  6479  	return sz
  6480  }
  6481  
  6482  // Will return total number of dmapEntries for all msg blocks.
  6483  func (fs *fileStore) dmapEntries() int {
  6484  	var total int
  6485  	fs.mu.RLock()
  6486  	for _, mb := range fs.blks {
  6487  		total += mb.dmap.Size()
  6488  	}
  6489  	fs.mu.RUnlock()
  6490  	return total
  6491  }
  6492  
  6493  // Fixed helper for iterating.
  6494  func subjectsEqual(a, b string) bool {
  6495  	return a == b
  6496  }
  6497  
  6498  func subjectsAll(a, b string) bool {
  6499  	return true
  6500  }
  6501  
  6502  func compareFn(subject string) func(string, string) bool {
  6503  	if subject == _EMPTY_ || subject == fwcs {
  6504  		return subjectsAll
  6505  	}
  6506  	if subjectHasWildcard(subject) {
  6507  		return subjectIsSubsetMatch
  6508  	}
  6509  	return subjectsEqual
  6510  }
  6511  
  6512  // PurgeEx will remove messages based on subject filters, sequence and number of messages to keep.
  6513  // Will return the number of purged messages.
  6514  func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint64, err error) {
  6515  	if subject == _EMPTY_ || subject == fwcs {
  6516  		if keep == 0 && sequence == 0 {
  6517  			return fs.Purge()
  6518  		}
  6519  		if sequence > 1 {
  6520  			return fs.Compact(sequence)
  6521  		}
  6522  	}
  6523  
  6524  	eq, wc := compareFn(subject), subjectHasWildcard(subject)
  6525  	var firstSeqNeedsUpdate bool
  6526  	var bytes uint64
  6527  
  6528  	// If we have a "keep" designation we need to get the full filtered state so we know how many to purge.
  6529  	var maxp uint64
  6530  	if keep > 0 {
  6531  		ss := fs.FilteredState(1, subject)
  6532  		if keep >= ss.Msgs {
  6533  			return 0, nil
  6534  		}
  6535  		maxp = ss.Msgs - keep
  6536  	}
  6537  
  6538  	var smv StoreMsg
  6539  
  6540  	fs.mu.Lock()
  6541  	// We may remove blocks as we purge, so don't range directly on fs.blks
  6542  	// otherwise we may jump over some (see https://github.com/nats-io/nats-server/issues/3528)
  6543  	for i := 0; i < len(fs.blks); i++ {
  6544  		mb := fs.blks[i]
  6545  		mb.mu.Lock()
  6546  
  6547  		// If we do not have our fss, try to expire the cache if we have no items in this block.
  6548  		shouldExpire := mb.fssNotLoaded()
  6549  
  6550  		t, f, l := mb.filteredPendingLocked(subject, wc, atomic.LoadUint64(&mb.first.seq))
  6551  		if t == 0 {
  6552  			// Expire if we were responsible for loading.
  6553  			if shouldExpire {
  6554  				// Expire this cache before moving on.
  6555  				mb.tryForceExpireCacheLocked()
  6556  			}
  6557  			mb.mu.Unlock()
  6558  			continue
  6559  		}
  6560  
  6561  		if sequence > 1 && sequence <= l {
  6562  			l = sequence - 1
  6563  		}
  6564  
  6565  		if mb.cacheNotLoaded() {
  6566  			mb.loadMsgsWithLock()
  6567  			shouldExpire = true
  6568  		}
  6569  
  6570  		for seq := f; seq <= l; seq++ {
  6571  			if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && eq(sm.subj, subject) {
  6572  				rl := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
  6573  				// Do fast in place remove.
  6574  				// Stats
  6575  				if mb.msgs > 0 {
  6576  					// Msgs
  6577  					fs.state.Msgs--
  6578  					mb.msgs--
  6579  					// Bytes, make sure to not go negative.
  6580  					if rl > fs.state.Bytes {
  6581  						rl = fs.state.Bytes
  6582  					}
  6583  					if rl > mb.bytes {
  6584  						rl = mb.bytes
  6585  					}
  6586  					fs.state.Bytes -= rl
  6587  					mb.bytes -= rl
  6588  					// Totals
  6589  					purged++
  6590  					bytes += rl
  6591  				}
  6592  				// FSS updates.
  6593  				mb.removeSeqPerSubject(sm.subj, seq)
  6594  				fs.removePerSubject(sm.subj)
  6595  
  6596  				// Check for first message.
  6597  				if seq == atomic.LoadUint64(&mb.first.seq) {
  6598  					mb.selectNextFirst()
  6599  					if mb.isEmpty() {
  6600  						fs.removeMsgBlock(mb)
  6601  						i--
  6602  						// keep flag set, if set previously
  6603  						firstSeqNeedsUpdate = firstSeqNeedsUpdate || seq == fs.state.FirstSeq
  6604  					} else if seq == fs.state.FirstSeq {
  6605  						fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) // new one.
  6606  						fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC()
  6607  					}
  6608  				} else {
  6609  					// Out of order delete.
  6610  					mb.dmap.Insert(seq)
  6611  				}
  6612  
  6613  				if maxp > 0 && purged >= maxp {
  6614  					break
  6615  				}
  6616  			}
  6617  		}
  6618  		// Expire if we were responsible for loading.
  6619  		if shouldExpire {
  6620  			// Expire this cache before moving on.
  6621  			mb.tryForceExpireCacheLocked()
  6622  		}
  6623  		mb.mu.Unlock()
  6624  
  6625  		// Check if we should break out of top level too.
  6626  		if maxp > 0 && purged >= maxp {
  6627  			break
  6628  		}
  6629  	}
  6630  	if firstSeqNeedsUpdate {
  6631  		fs.selectNextFirst()
  6632  	}
  6633  
  6634  	fs.dirty++
  6635  	cb := fs.scb
  6636  	fs.mu.Unlock()
  6637  
  6638  	fs.kickFlushStateLoop()
  6639  
  6640  	if cb != nil {
  6641  		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
  6642  	}
  6643  
  6644  	return purged, nil
  6645  }
  6646  
  6647  // Purge will remove all messages from this store.
  6648  // Will return the number of purged messages.
  6649  func (fs *fileStore) Purge() (uint64, error) {
  6650  	return fs.purge(0)
  6651  }
  6652  
  6653  func (fs *fileStore) purge(fseq uint64) (uint64, error) {
  6654  	fs.mu.Lock()
  6655  	if fs.closed {
  6656  		fs.mu.Unlock()
  6657  		return 0, ErrStoreClosed
  6658  	}
  6659  
  6660  	purged := fs.state.Msgs
  6661  	rbytes := int64(fs.state.Bytes)
  6662  
  6663  	fs.state.FirstSeq = fs.state.LastSeq + 1
  6664  	fs.state.FirstTime = time.Time{}
  6665  
  6666  	fs.state.Bytes = 0
  6667  	fs.state.Msgs = 0
  6668  
  6669  	for _, mb := range fs.blks {
  6670  		mb.dirtyClose()
  6671  	}
  6672  
  6673  	fs.blks = nil
  6674  	fs.lmb = nil
  6675  	fs.bim = make(map[uint32]*msgBlock)
  6676  	// Clear any per subject tracking.
  6677  	fs.psim, fs.tsl = fs.psim.Empty(), 0
  6678  	// Mark dirty
  6679  	fs.dirty++
  6680  
  6681  	// Move the msgs directory out of the way, will delete out of band.
  6682  	// FIXME(dlc) - These can error and we need to change api above to propagate?
  6683  	mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
  6684  	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
  6685  	// If purge directory still exists then we need to wait
  6686  	// in place and remove since rename would fail.
  6687  	if _, err := os.Stat(pdir); err == nil {
  6688  		<-dios
  6689  		os.RemoveAll(pdir)
  6690  		dios <- struct{}{}
  6691  	}
  6692  
  6693  	<-dios
  6694  	os.Rename(mdir, pdir)
  6695  	dios <- struct{}{}
  6696  
  6697  	go func() {
  6698  		<-dios
  6699  		os.RemoveAll(pdir)
  6700  		dios <- struct{}{}
  6701  	}()
  6702  
  6703  	// Create new one.
  6704  	<-dios
  6705  	os.MkdirAll(mdir, defaultDirPerms)
  6706  	dios <- struct{}{}
  6707  
  6708  	// Make sure we have a lmb to write to.
  6709  	if _, err := fs.newMsgBlockForWrite(); err != nil {
  6710  		fs.mu.Unlock()
  6711  		return purged, err
  6712  	}
  6713  
  6714  	// Check if we need to set the first seq to a new number.
  6715  	if fseq > fs.state.FirstSeq {
  6716  		fs.state.FirstSeq = fseq
  6717  		fs.state.LastSeq = fseq - 1
  6718  	}
  6719  
  6720  	lmb := fs.lmb
  6721  	atomic.StoreUint64(&lmb.first.seq, fs.state.FirstSeq)
  6722  	atomic.StoreUint64(&lmb.last.seq, fs.state.LastSeq)
  6723  	lmb.last.ts = fs.state.LastTime.UnixNano()
  6724  
  6725  	if lseq := atomic.LoadUint64(&lmb.last.seq); lseq > 1 {
  6726  		// Leave a tombstone so we can remember our starting sequence in case
  6727  		// full state becomes corrupted.
  6728  		lmb.writeTombstone(lseq, lmb.last.ts)
  6729  	}
  6730  
  6731  	cb := fs.scb
  6732  	fs.mu.Unlock()
  6733  
  6734  	if cb != nil {
  6735  		cb(-int64(purged), -rbytes, 0, _EMPTY_)
  6736  	}
  6737  
  6738  	return purged, nil
  6739  }
  6740  
  6741  // Compact will remove all messages from this store up to
  6742  // but not including the seq parameter.
  6743  // Will return the number of purged messages.
  6744  func (fs *fileStore) Compact(seq uint64) (uint64, error) {
  6745  	if seq == 0 {
  6746  		return fs.purge(seq)
  6747  	}
  6748  
  6749  	var purged, bytes uint64
  6750  
  6751  	fs.mu.Lock()
  6752  	// Same as purge all.
  6753  	if lseq := fs.state.LastSeq; seq > lseq {
  6754  		fs.mu.Unlock()
  6755  		return fs.purge(seq)
  6756  	}
  6757  	// We have to delete interior messages.
  6758  	smb := fs.selectMsgBlock(seq)
  6759  	if smb == nil {
  6760  		fs.mu.Unlock()
  6761  		return 0, nil
  6762  	}
  6763  
  6764  	// All msgblocks up to this one can be thrown away.
  6765  	var deleted int
  6766  	for _, mb := range fs.blks {
  6767  		if mb == smb {
  6768  			break
  6769  		}
  6770  		mb.mu.Lock()
  6771  		purged += mb.msgs
  6772  		bytes += mb.bytes
  6773  		// Make sure we do subject cleanup as well.
  6774  		mb.ensurePerSubjectInfoLoaded()
  6775  		for subj, ss := range mb.fss {
  6776  			for i := uint64(0); i < ss.Msgs; i++ {
  6777  				fs.removePerSubject(subj)
  6778  			}
  6779  		}
  6780  		// Now close.
  6781  		mb.dirtyCloseWithRemove(true)
  6782  		mb.mu.Unlock()
  6783  		deleted++
  6784  	}
  6785  
  6786  	var smv StoreMsg
  6787  	var err error
  6788  	var isEmpty bool
  6789  
  6790  	smb.mu.Lock()
  6791  	if atomic.LoadUint64(&smb.first.seq) == seq {
  6792  		fs.state.FirstSeq = atomic.LoadUint64(&smb.first.seq)
  6793  		fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC()
  6794  		goto SKIP
  6795  	}
  6796  
  6797  	// Make sure we have the messages loaded.
  6798  	if smb.cacheNotLoaded() {
  6799  		if err = smb.loadMsgsWithLock(); err != nil {
  6800  			goto SKIP
  6801  		}
  6802  	}
  6803  	for mseq := atomic.LoadUint64(&smb.first.seq); mseq < seq; mseq++ {
  6804  		sm, err := smb.cacheLookup(mseq, &smv)
  6805  		if err == errDeletedMsg {
  6806  			// Update dmap.
  6807  			if !smb.dmap.IsEmpty() {
  6808  				smb.dmap.Delete(mseq)
  6809  			}
  6810  		} else if sm != nil {
  6811  			sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)
  6812  			if smb.msgs > 0 {
  6813  				smb.msgs--
  6814  				if sz > smb.bytes {
  6815  					sz = smb.bytes
  6816  				}
  6817  				smb.bytes -= sz
  6818  				bytes += sz
  6819  				purged++
  6820  			}
  6821  			// Update fss
  6822  			smb.removeSeqPerSubject(sm.subj, mseq)
  6823  			fs.removePerSubject(sm.subj)
  6824  		}
  6825  	}
  6826  
  6827  	// Check if empty after processing; this could happen if the tail messages were all deleted.
  6828  	isEmpty = smb.msgs == 0
  6829  	if isEmpty {
  6830  		smb.dirtyCloseWithRemove(true)
  6831  		// Update fs first here as well.
  6832  		fs.state.FirstSeq = atomic.LoadUint64(&smb.last.seq) + 1
  6833  		fs.state.FirstTime = time.Time{}
  6834  		deleted++
  6835  	} else {
  6836  		// Make sure to sync changes.
  6837  		smb.needSync = true
  6838  		// Update fs first seq and time.
  6839  		atomic.StoreUint64(&smb.first.seq, seq-1) // Just for start condition for selectNextFirst.
  6840  		smb.selectNextFirst()
  6841  
  6842  		fs.state.FirstSeq = atomic.LoadUint64(&smb.first.seq)
  6843  		fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC()
  6844  
  6845  		// Check if we should reclaim the head space from this block.
  6846  		// This will be optimistic only, so don't continue if we encounter any errors here.
  6847  		if smb.rbytes > compactMinimum && smb.bytes*2 < smb.rbytes {
  6848  			var moff uint32
  6849  			moff, _, _, err = smb.slotInfo(int(atomic.LoadUint64(&smb.first.seq) - smb.cache.fseq))
  6850  			if err != nil || moff >= uint32(len(smb.cache.buf)) {
  6851  				goto SKIP
  6852  			}
  6853  			buf := smb.cache.buf[moff:]
  6854  			// Don't reuse, copy to new recycled buf.
  6855  			nbuf := getMsgBlockBuf(len(buf))
  6856  			nbuf = append(nbuf, buf...)
  6857  			smb.closeFDsLockedNoCheck()
  6858  			// Check for encryption.
  6859  			if smb.bek != nil && len(nbuf) > 0 {
  6860  				// Recreate to reset counter.
  6861  				bek, err := genBlockEncryptionKey(smb.fs.fcfg.Cipher, smb.seed, smb.nonce)
  6862  				if err != nil {
  6863  					goto SKIP
  6864  				}
  6865  				// For future writes make sure to set smb.bek to keep counter correct.
  6866  				smb.bek = bek
  6867  				smb.bek.XORKeyStream(nbuf, nbuf)
  6868  			}
  6869  			// Recompress if necessary (smb.cmp contains the algorithm used when
  6870  			// the block was loaded from disk, or defaults to NoCompression if not)
  6871  			if nbuf, err = smb.cmp.Compress(nbuf); err != nil {
  6872  				goto SKIP
  6873  			}
  6874  			<-dios
  6875  			err = os.WriteFile(smb.mfn, nbuf, defaultFilePerms)
  6876  			dios <- struct{}{}
  6877  			if err != nil {
  6878  				goto SKIP
  6879  			}
  6880  			// Make sure to remove fss state.
  6881  			smb.fss = nil
  6882  			smb.clearCacheAndOffset()
  6883  			smb.rbytes = uint64(len(nbuf))
  6884  		}
  6885  	}
  6886  
  6887  SKIP:
  6888  	smb.mu.Unlock()
  6889  
  6890  	if deleted > 0 {
  6891  		// Update block map.
  6892  		if fs.bim != nil {
  6893  			for _, mb := range fs.blks[:deleted] {
  6894  				delete(fs.bim, mb.index)
  6895  			}
  6896  		}
  6897  		// Update blks slice.
  6898  		fs.blks = copyMsgBlocks(fs.blks[deleted:])
  6899  		if lb := len(fs.blks); lb == 0 {
  6900  			fs.lmb = nil
  6901  		} else {
  6902  			fs.lmb = fs.blks[lb-1]
  6903  		}
  6904  	}
  6905  
  6906  	// Update top level accounting.
  6907  	if purged > fs.state.Msgs {
  6908  		purged = fs.state.Msgs
  6909  	}
  6910  	fs.state.Msgs -= purged
  6911  
  6912  	if bytes > fs.state.Bytes {
  6913  		bytes = fs.state.Bytes
  6914  	}
  6915  	fs.state.Bytes -= bytes
  6916  
  6917  	fs.dirty++
  6918  	fs.kickFlushStateLoop()
  6919  
  6920  	cb := fs.scb
  6921  	fs.mu.Unlock()
  6922  
  6923  	if cb != nil && purged > 0 {
  6924  		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
  6925  	}
  6926  
  6927  	return purged, err
  6928  }
  6929  
  6930  // Will completely reset our store.
  6931  func (fs *fileStore) reset() error {
  6932  	fs.mu.Lock()
  6933  	if fs.closed {
  6934  		fs.mu.Unlock()
  6935  		return ErrStoreClosed
  6936  	}
  6937  	if fs.sips > 0 {
  6938  		fs.mu.Unlock()
  6939  		return ErrStoreSnapshotInProgress
  6940  	}
  6941  
  6942  	var purged, bytes uint64
  6943  	cb := fs.scb
  6944  
  6945  	for _, mb := range fs.blks {
  6946  		mb.mu.Lock()
  6947  		purged += mb.msgs
  6948  		bytes += mb.bytes
  6949  		mb.dirtyCloseWithRemove(true)
  6950  		mb.mu.Unlock()
  6951  	}
  6952  
  6953  	// Reset
  6954  	fs.state.FirstSeq = 0
  6955  	fs.state.FirstTime = time.Time{}
  6956  	fs.state.LastSeq = 0
  6957  	fs.state.LastTime = time.Now().UTC()
  6958  	// Update msgs and bytes.
  6959  	fs.state.Msgs = 0
  6960  	fs.state.Bytes = 0
  6961  
  6962  	// Reset blocks.
  6963  	fs.blks, fs.lmb = nil, nil
  6964  
  6965  	// Reset subject mappings.
  6966  	fs.psim, fs.tsl = fs.psim.Empty(), 0
  6967  	fs.bim = make(map[uint32]*msgBlock)
  6968  
  6969  	// If we purged anything, make sure we kick flush state loop.
  6970  	if purged > 0 {
  6971  		fs.dirty++
  6972  		fs.kickFlushStateLoop()
  6973  	}
  6974  
  6975  	fs.mu.Unlock()
  6976  
  6977  	if cb != nil {
  6978  		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
  6979  	}
  6980  
  6981  	return nil
  6982  }
  6983  
  6984  // Truncate will truncate a stream store up to seq. Sequence needs to be valid.
  6985  func (fs *fileStore) Truncate(seq uint64) error {
  6986  	// Check for request to reset.
  6987  	if seq == 0 {
  6988  		return fs.reset()
  6989  	}
  6990  
  6991  	fs.mu.Lock()
  6992  
  6993  	if fs.closed {
  6994  		fs.mu.Unlock()
  6995  		return ErrStoreClosed
  6996  	}
  6997  	if fs.sips > 0 {
  6998  		fs.mu.Unlock()
  6999  		return ErrStoreSnapshotInProgress
  7000  	}
  7001  
  7002  	nlmb := fs.selectMsgBlock(seq)
  7003  	if nlmb == nil {
  7004  		fs.mu.Unlock()
  7005  		return ErrInvalidSequence
  7006  	}
  7007  	lsm, _, _ := nlmb.fetchMsg(seq, nil)
  7008  	if lsm == nil {
  7009  		fs.mu.Unlock()
  7010  		return ErrInvalidSequence
  7011  	}
  7012  
  7013  	// Set lmb to nlmb and make sure it is writable.
  7014  	fs.lmb = nlmb
  7015  	if err := nlmb.enableForWriting(fs.fip); err != nil {
        		fs.mu.Unlock()
  7016  		return err
  7017  	}
  7018  
  7019  	var purged, bytes uint64
  7020  
  7021  	// Truncate our new last message block.
  7022  	nmsgs, nbytes, err := nlmb.truncate(lsm)
  7023  	if err != nil {
  7024  		fs.mu.Unlock()
  7025  		return fmt.Errorf("nlmb.truncate: %w", err)
  7026  	}
  7027  	// Account for the truncated msgs and bytes.
  7028  	purged += nmsgs
  7029  	bytes += nbytes
  7030  
  7031  	// Remove any left over msg blocks.
  7032  	getLastMsgBlock := func() *msgBlock { return fs.blks[len(fs.blks)-1] }
  7033  	for mb := getLastMsgBlock(); mb != nlmb; mb = getLastMsgBlock() {
  7034  		mb.mu.Lock()
  7035  		purged += mb.msgs
  7036  		bytes += mb.bytes
  7037  		fs.removeMsgBlock(mb)
  7038  		mb.mu.Unlock()
  7039  	}
  7040  
  7041  	// Reset last.
  7042  	fs.state.LastSeq = lsm.seq
  7043  	fs.state.LastTime = time.Unix(0, lsm.ts).UTC()
  7044  	// Update msgs and bytes.
  7045  	if purged > fs.state.Msgs {
  7046  		purged = fs.state.Msgs
  7047  	}
  7048  	fs.state.Msgs -= purged
  7049  	if bytes > fs.state.Bytes {
  7050  		bytes = fs.state.Bytes
  7051  	}
  7052  	fs.state.Bytes -= bytes
  7053  
  7054  	// Reset our subject lookup info.
  7055  	fs.resetGlobalPerSubjectInfo()
  7056  
  7057  	fs.dirty++
  7058  	fs.kickFlushStateLoop()
  7059  
  7060  	cb := fs.scb
  7061  	fs.mu.Unlock()
  7062  
  7063  	if cb != nil {
  7064  		cb(-int64(purged), -int64(bytes), 0, _EMPTY_)
  7065  	}
  7066  
  7067  	return nil
  7068  }
  7069  
  7070  func (fs *fileStore) lastSeq() uint64 {
  7071  	fs.mu.RLock()
  7072  	seq := fs.state.LastSeq
  7073  	fs.mu.RUnlock()
  7074  	return seq
  7075  }
  7076  
  7077  // Returns number of msg blks.
  7078  func (fs *fileStore) numMsgBlocks() int {
  7079  	fs.mu.RLock()
  7080  	defer fs.mu.RUnlock()
  7081  	return len(fs.blks)
  7082  }
  7083  
  7084  // Will add a new msgBlock.
  7085  // Lock should be held.
  7086  func (fs *fileStore) addMsgBlock(mb *msgBlock) {
  7087  	fs.blks = append(fs.blks, mb)
  7088  	fs.lmb = mb
  7089  	fs.bim[mb.index] = mb
  7090  }
  7091  
  7092  // Remove from our list of blks.
  7093  // Both locks should be held.
  7094  func (fs *fileStore) removeMsgBlockFromList(mb *msgBlock) {
  7095  	// Remove from list.
  7096  	for i, omb := range fs.blks {
  7097  		if mb == omb {
  7098  			fs.dirty++
  7099  			blks := append(fs.blks[:i], fs.blks[i+1:]...)
  7100  			fs.blks = copyMsgBlocks(blks)
  7101  			if fs.bim != nil {
  7102  				delete(fs.bim, mb.index)
  7103  			}
  7104  			break
  7105  		}
  7106  	}
  7107  }
  7108  
  7109  // Removes the msgBlock
  7110  // Both locks should be held.
  7111  func (fs *fileStore) removeMsgBlock(mb *msgBlock) {
  7112  	mb.dirtyCloseWithRemove(true)
  7113  	fs.removeMsgBlockFromList(mb)
  7114  	// Check if we are the last message block.
  7115  	if mb == fs.lmb {
  7116  		lseq, lts := atomic.LoadUint64(&mb.last.seq), mb.last.ts
  7117  		// Creating a new message write block requires that the lmb lock is not held.
  7118  		mb.mu.Unlock()
  7119  		// Write the tombstone to remember since this was last block.
  7120  		if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil {
  7121  			lmb.writeTombstone(lseq, lts)
  7122  		}
  7123  		mb.mu.Lock()
  7124  	}
  7125  }
  7126  
  7127  // Called by purge to simply get rid of the cache and close our fds.
  7128  // Lock should not be held.
  7129  func (mb *msgBlock) dirtyClose() {
  7130  	mb.mu.Lock()
  7131  	defer mb.mu.Unlock()
  7132  	mb.dirtyCloseWithRemove(false)
  7133  }
  7134  
  7135  // Should be called with lock held.
  7136  func (mb *msgBlock) dirtyCloseWithRemove(remove bool) {
  7137  	if mb == nil {
  7138  		return
  7139  	}
  7140  	// Stop cache expiration timer.
  7141  	if mb.ctmr != nil {
  7142  		mb.ctmr.Stop()
  7143  		mb.ctmr = nil
  7144  	}
  7145  	// Clear any tracking by subject.
  7146  	mb.fss = nil
  7147  	// Close cache
  7148  	mb.clearCacheAndOffset()
  7149  	// Quit our loops.
  7150  	if mb.qch != nil {
  7151  		close(mb.qch)
  7152  		mb.qch = nil
  7153  	}
  7154  	if mb.mfd != nil {
  7155  		mb.mfd.Close()
  7156  		mb.mfd = nil
  7157  	}
  7158  	if remove {
  7159  		if mb.mfn != _EMPTY_ {
  7160  			os.Remove(mb.mfn)
  7161  			mb.mfn = _EMPTY_
  7162  		}
  7163  		if mb.kfn != _EMPTY_ {
  7164  			os.Remove(mb.kfn)
  7165  		}
  7166  		// Since we are removing a block kick the state flusher.
  7167  		mb.fs.kickFlushStateLoop()
  7168  	}
  7169  }
  7170  
  7171  // Remove a seq from the fss and select new first.
  7172  // Lock should be held.
  7173  func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) {
  7174  	mb.ensurePerSubjectInfoLoaded()
  7175  	ss := mb.fss[subj]
  7176  	if ss == nil {
  7177  		return
  7178  	}
  7179  
  7180  	if ss.Msgs == 1 {
  7181  		delete(mb.fss, subj)
  7182  		return
  7183  	}
  7184  
  7185  	ss.Msgs--
  7186  
  7187  	// Only one left.
  7188  	if ss.Msgs == 1 {
  7189  		if seq == ss.Last {
  7190  			ss.Last = ss.First
  7191  		} else {
  7192  			ss.First = ss.Last
  7193  		}
  7194  		ss.firstNeedsUpdate = false
  7195  		return
  7196  	}
  7197  
  7198  	// We can lazily calculate the first sequence when needed.
  7199  	ss.firstNeedsUpdate = seq == ss.First || ss.firstNeedsUpdate
  7200  }
  7201  
  7202  // Will recalculate the first sequence for this subject in this block.
  7203  // Will avoid slower path message lookups and scan the cache directly instead.
  7204  func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *SimpleState) {
  7205  	// Need to make sure messages are loaded.
  7206  	if mb.cacheNotLoaded() {
  7207  		if err := mb.loadMsgsWithLock(); err != nil {
  7208  			return
  7209  		}
  7210  	}
  7211  
  7212  	// Mark first as updated.
  7213  	ss.firstNeedsUpdate = false
  7214  	startSeq++
  7215  
  7216  	startSlot := int(startSeq - mb.cache.fseq)
  7217  	if startSlot >= len(mb.cache.idx) {
  7218  		ss.First = ss.Last
  7219  		return
  7220  	} else if startSlot < 0 {
  7221  		startSlot = 0
  7222  	}
  7223  
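        	// Scan the cache slots from startSlot forward, decoding each record header and comparing the subject.
        	// The first live match (not erased and not in the delete map) becomes the new First.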
  7224  	var le = binary.LittleEndian
  7225  	for slot, fseq := startSlot, atomic.LoadUint64(&mb.first.seq); slot < len(mb.cache.idx); slot++ {
  7226  		bi := mb.cache.idx[slot] &^ hbit
  7227  		if bi == dbit {
  7228  			// delete marker so skip.
  7229  			continue
  7230  		}
  7231  		li := int(bi) - mb.cache.off
  7232  		if li >= len(mb.cache.buf) {
  7233  			ss.First = ss.Last
  7234  			return
  7235  		}
  7236  		buf := mb.cache.buf[li:]
  7237  		hdr := buf[:msgHdrSize]
  7238  		slen := int(le.Uint16(hdr[20:]))
  7239  		if subj == bytesToString(buf[msgHdrSize:msgHdrSize+slen]) {
  7240  			seq := le.Uint64(hdr[4:])
  7241  			if seq < fseq || seq&ebit != 0 || mb.dmap.Exists(seq) {
  7242  				continue
  7243  			}
  7244  			ss.First = seq
  7245  			return
  7246  		}
  7247  	}
  7248  }
  7249  
  7250  // Lock should be held.
  7251  func (fs *fileStore) resetGlobalPerSubjectInfo() {
  7252  	// Clear any global subject state.
  7253  	fs.psim, fs.tsl = fs.psim.Empty(), 0
  7254  	for _, mb := range fs.blks {
  7255  		fs.populateGlobalPerSubjectInfo(mb)
  7256  	}
  7257  }
  7258  
  7259  // Lock should be held.
  7260  func (mb *msgBlock) resetPerSubjectInfo() error {
  7261  	mb.fss = nil
  7262  	return mb.generatePerSubjectInfo()
  7263  }
  7264  
  7265  // generatePerSubjectInfo will generate the per subject info via the raw msg block.
  7266  // Lock should be held.
  7267  func (mb *msgBlock) generatePerSubjectInfo() error {
  7268  	// Check if this mb is empty. This can happen when it's the last one and we are holding onto it for seq and timestamp info.
  7269  	if mb.msgs == 0 {
  7270  		return nil
  7271  	}
  7272  
  7273  	if mb.cacheNotLoaded() {
  7274  		if err := mb.loadMsgsWithLock(); err != nil {
  7275  			return err
  7276  		}
  7277  		// indexCacheBuf can produce fss now, so if non-nil we are good.
  7278  		if mb.fss != nil {
  7279  			return nil
  7280  		}
  7281  	}
  7282  
  7283  	// Create new one regardless.
  7284  	mb.fss = make(map[string]*SimpleState)
  7285  
  7286  	var smv StoreMsg
  7287  	fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
  7288  	for seq := fseq; seq <= lseq; seq++ {
  7289  		sm, err := mb.cacheLookup(seq, &smv)
  7290  		if err != nil {
  7291  			// Since we are walking by sequence we can ignore some errors that are benign to rebuilding our state.
  7292  			if err == ErrStoreMsgNotFound || err == errDeletedMsg {
  7293  				continue
  7294  			}
  7295  			if err == errNoCache {
  7296  				return nil
  7297  			}
  7298  			return err
  7299  		}
  7300  		if sm != nil && len(sm.subj) > 0 {
  7301  			if ss := mb.fss[sm.subj]; ss != nil {
  7302  				ss.Msgs++
  7303  				ss.Last = seq
  7304  			} else {
  7305  				mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
  7306  			}
  7307  		}
  7308  	}
  7309  
  7310  	if len(mb.fss) > 0 {
  7311  		// Make sure we run the cache expire timer.
  7312  		mb.llts = time.Now().UnixNano()
  7313  		mb.startCacheExpireTimer()
  7314  	}
  7315  	return nil
  7316  }
  7317  
  7318  // Helper to make sure fss loaded if we are tracking.
  7319  // Lock should be held
  7320  func (mb *msgBlock) ensurePerSubjectInfoLoaded() error {
  7321  	if mb.fss != nil || mb.noTrack {
  7322  		return nil
  7323  	}
  7324  	if mb.msgs == 0 {
  7325  		mb.fss = make(map[string]*SimpleState)
  7326  		return nil
  7327  	}
  7328  	return mb.generatePerSubjectInfo()
  7329  }
  7330  
  7331  // Called on recovery to populate the global psim state.
  7332  // Lock should be held.
  7333  func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) {
  7334  	mb.mu.Lock()
  7335  	defer mb.mu.Unlock()
  7336  
  7337  	if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
  7338  		return
  7339  	}
  7340  
  7341  	// Now populate psim.
  7342  	for subj, ss := range mb.fss {
  7343  		if len(subj) > 0 {
  7344  			bsubj := stringToBytes(subj)
  7345  			if info, ok := fs.psim.Find(bsubj); ok {
  7346  				info.total += ss.Msgs
  7347  				if mb.index > info.lblk {
  7348  					info.lblk = mb.index
  7349  				}
  7350  			} else {
  7351  				fs.psim.Insert(bsubj, psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index})
  7352  				fs.tsl += len(subj)
  7353  			}
  7354  		}
  7355  	}
  7356  }
  7357  
  7358  // Close the message block.
  7359  func (mb *msgBlock) close(sync bool) {
  7360  	if mb == nil {
  7361  		return
  7362  	}
  7363  	mb.mu.Lock()
  7364  	defer mb.mu.Unlock()
  7365  
  7366  	if mb.closed {
  7367  		return
  7368  	}
  7369  
  7370  	// Stop cache expiration timer.
  7371  	if mb.ctmr != nil {
  7372  		mb.ctmr.Stop()
  7373  		mb.ctmr = nil
  7374  	}
  7375  
  7376  	// Clear fss.
  7377  	mb.fss = nil
  7378  
  7379  	// Close cache
  7380  	mb.clearCacheAndOffset()
  7381  	// Quit our loops.
  7382  	if mb.qch != nil {
  7383  		close(mb.qch)
  7384  		mb.qch = nil
  7385  	}
  7386  	if mb.mfd != nil {
  7387  		if sync {
  7388  			mb.mfd.Sync()
  7389  		}
  7390  		mb.mfd.Close()
  7391  	}
  7392  	mb.mfd = nil
  7393  	// Mark as closed.
  7394  	mb.closed = true
  7395  }
  7396  
  7397  func (fs *fileStore) closeAllMsgBlocks(sync bool) {
  7398  	for _, mb := range fs.blks {
  7399  		mb.close(sync)
  7400  	}
  7401  }
  7402  
  7403  func (fs *fileStore) Delete() error {
  7404  	if fs.isClosed() {
  7405  		// Always attempt to remove since we could have been closed beforehand.
  7406  		os.RemoveAll(fs.fcfg.StoreDir)
  7407  		// Since we did remove the directory, if we had anything remaining make sure to
  7408  		// call into any storage update callbacks that had been registered.
  7409  		fs.mu.Lock()
  7410  		cb, msgs, bytes := fs.scb, int64(fs.state.Msgs), int64(fs.state.Bytes)
  7411  		// Guard against double accounting if called twice.
  7412  		fs.state.Msgs, fs.state.Bytes = 0, 0
  7413  		fs.mu.Unlock()
  7414  		if msgs > 0 && cb != nil {
  7415  			cb(-msgs, -bytes, 0, _EMPTY_)
  7416  		}
  7417  		return ErrStoreClosed
  7418  	}
  7419  
  7420  	pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
  7421  	// If the purge directory still exists we need to remove it
  7422  	// in place first, since the rename done by Purge below would fail otherwise.
  7423  	if _, err := os.Stat(pdir); err == nil {
  7424  		os.RemoveAll(pdir)
  7425  	}
  7426  
  7427  	// Do Purge() since, if we have lots of blocks, it uses a mv/rename.
  7428  	fs.Purge()
  7429  
  7430  	if err := fs.stop(false); err != nil {
  7431  		return err
  7432  	}
  7433  
  7434  	// Make sure we will not try to recover if killed before removal below completes.
  7435  	if err := os.Remove(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)); err != nil {
  7436  		return err
  7437  	}
  7438  	// Now move into a different directory with a "." prefix.
  7439  	ndir := filepath.Join(filepath.Dir(fs.fcfg.StoreDir), tsep+filepath.Base(fs.fcfg.StoreDir))
  7440  	if err := os.Rename(fs.fcfg.StoreDir, ndir); err != nil {
  7441  		return err
  7442  	}
  7443  	// Do this in a separate Go routine in case there are lots of blocks.
  7444  	// The Purge above protects us, as does the removal of the meta artifacts above.
  7445  	go func() {
  7446  		err := os.RemoveAll(ndir)
  7447  		if err == nil {
  7448  			return
  7449  		}
  7450  		ttl := time.Now().Add(time.Second)
  7451  		for time.Now().Before(ttl) {
  7452  			time.Sleep(10 * time.Millisecond)
  7453  			if err = os.RemoveAll(ndir); err == nil {
  7454  				return
  7455  			}
  7456  		}
  7457  	}()
  7458  
  7459  	return nil
  7460  }
  7461  
  7462  // Lock should be held.
  7463  func (fs *fileStore) setSyncTimer() {
  7464  	if fs.syncTmr != nil {
  7465  		fs.syncTmr.Reset(fs.fcfg.SyncInterval)
  7466  	} else {
  7467  		fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks)
  7468  	}
  7469  }
  7470  
  7471  // Lock should be held.
  7472  func (fs *fileStore) cancelSyncTimer() {
  7473  	if fs.syncTmr != nil {
  7474  		fs.syncTmr.Stop()
  7475  		fs.syncTmr = nil
  7476  	}
  7477  }
  7478  
  7479  const (
  7480  	fullStateMagic   = uint8(11)
  7481  	fullStateVersion = uint8(1)
  7482  )
  7483  
  7484  // This go routine runs and receives kicks to write out our full stream state index.
  7485  // This will get kicked when we create a new block or when we delete a block in general.
  7486  // This is also called during Stop().
  7487  func (fs *fileStore) flushStreamStateLoop(fch, qch, done chan struct{}) {
  7488  	// Make sure we do not try to write these out too fast.
  7489  	const writeThreshold = time.Minute
  7490  	lastWrite := time.Time{}
  7491  
  7492  	// We will use these to complete the full state write while not doing them too fast.
  7493  	var dt *time.Timer
  7494  	var dtc <-chan time.Time
  7495  
  7496  	defer close(done)
  7497  
  7498  	for {
  7499  		select {
  7500  		case <-fch:
  7501  			if elapsed := time.Since(lastWrite); elapsed > writeThreshold {
  7502  				fs.writeFullState()
  7503  				lastWrite = time.Now()
  7504  				if dt != nil {
  7505  					dt.Stop()
  7506  					dt, dtc = nil, nil
  7507  				}
  7508  			} else if dtc == nil {
  7509  				fireIn := time.Until(lastWrite.Add(writeThreshold))
  7510  				if fireIn < 0 {
  7511  					fireIn = 100 * time.Millisecond
  7512  				}
  7513  				dt = time.NewTimer(fireIn)
  7514  				dtc = dt.C
  7515  			}
  7516  		case <-dtc:
  7517  			fs.writeFullState()
  7518  			lastWrite = time.Now()
  7519  			dt, dtc = nil, nil
  7520  		case <-qch:
  7521  			return
  7522  		}
  7523  	}
  7524  }
  7525  
  7526  // Kick the flusher.
  7527  func (fs *fileStore) kickFlushStateLoop() {
  7528  	kickFlusher(fs.fch)
  7529  }
  7530  
  7531  // Helper since UnixNano of a zero time is undefined.
  7532  func timestampNormalized(t time.Time) int64 {
  7533  	if t.IsZero() {
  7534  		return 0
  7535  	}
  7536  	return t.UnixNano()
  7537  }
  7538  
  7539  // This will write the full binary state for the stream.
  7540  // This plus everything new since last hash will be the total recovered state.
  7541  // This state dump will have the following.
  7542  // 1. Stream summary - Msgs, Bytes, First and Last (Sequence and Timestamp)
  7543  // 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present.
  7544  // 3. MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset).
  7545  // 4. Last block index and hash of record inclusive to this stream state.
  7546  func (fs *fileStore) writeFullState() error {
  7547  	fs.mu.Lock()
  7548  	if fs.closed || fs.dirty == 0 {
  7549  		fs.mu.Unlock()
  7550  		return nil
  7551  	}
  7552  
  7553  	// We track this across runs to get an avg dmap length per blk to use on subsequent runs.
  7554  	avgDmapLen := fs.adml
  7555  	// If this is the first time through it could be 0.
  7556  	if avgDmapLen == 0 && ((fs.state.LastSeq-fs.state.FirstSeq+1)-fs.state.Msgs) > 0 {
  7557  		avgDmapLen = 1024
  7558  	}
  7559  
  7560  	// For calculating size.
  7561  	numSubjects := fs.psim.Size()
  7562  
  7563  	// Calculate an estimate of the upper bound on the size to avoid multiple allocations.
  7564  	sz := 2 + // Magic and Version
  7565  		(binary.MaxVarintLen64 * 6) + // FS data
  7566  		binary.MaxVarintLen64 + fs.tsl + // NumSubjects + total subject length
  7567  		numSubjects*(binary.MaxVarintLen64*4) + // psi record
  7568  		binary.MaxVarintLen64 + // Num blocks.
  7569  		len(fs.blks)*((binary.MaxVarintLen64*7)+avgDmapLen) + // msg blocks, avgDmapLen is est for dmaps
  7570  		binary.MaxVarintLen64 + 8 + 8 // last index + record checksum + full state checksum
  7571  
  7572  	// Do 4k on stack if possible.
  7573  	const ssz = 4 * 1024
  7574  	var buf []byte
  7575  
  7576  	if sz <= ssz {
  7577  		var _buf [ssz]byte
  7578  		buf, sz = _buf[0:2:ssz], ssz
  7579  	} else {
  7580  		buf = make([]byte, hdrLen, sz)
  7581  	}
  7582  
  7583  	buf[0], buf[1] = fullStateMagic, fullStateVersion
  7584  	buf = binary.AppendUvarint(buf, fs.state.Msgs)
  7585  	buf = binary.AppendUvarint(buf, fs.state.Bytes)
  7586  	buf = binary.AppendUvarint(buf, fs.state.FirstSeq)
  7587  	buf = binary.AppendVarint(buf, timestampNormalized(fs.state.FirstTime))
  7588  	buf = binary.AppendUvarint(buf, fs.state.LastSeq)
  7589  	buf = binary.AppendVarint(buf, timestampNormalized(fs.state.LastTime))
  7590  
  7591  	// Do per subject information map if applicable.
  7592  	buf = binary.AppendUvarint(buf, uint64(numSubjects))
  7593  	if numSubjects > 0 {
  7594  		fs.psim.Match([]byte(fwcs), func(subj []byte, psi *psi) {
  7595  			buf = binary.AppendUvarint(buf, uint64(len(subj)))
  7596  			buf = append(buf, subj...)
  7597  			buf = binary.AppendUvarint(buf, psi.total)
  7598  			buf = binary.AppendUvarint(buf, uint64(psi.fblk))
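        			// Only write lblk when total > 1; with a single message the first and last block are the same.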
  7599  			if psi.total > 1 {
  7600  				buf = binary.AppendUvarint(buf, uint64(psi.lblk))
  7601  			}
  7602  		})
  7603  	}
  7604  
  7605  	// Now walk all blocks and write out first and last and optional dmap encoding.
  7606  	var lbi uint32
  7607  	var lchk [8]byte
  7608  
  7609  	nb := len(fs.blks)
  7610  	buf = binary.AppendUvarint(buf, uint64(nb))
  7611  
  7612  	// Use basetime to save some space.
  7613  	baseTime := timestampNormalized(fs.state.FirstTime)
  7614  	var scratch [8 * 1024]byte
  7615  
  7616  	// Track the state as represented by the mbs.
  7617  	var mstate StreamState
  7618  
  7619  	var dmapTotalLen int
  7620  	for _, mb := range fs.blks {
  7621  		mb.mu.RLock()
  7622  		buf = binary.AppendUvarint(buf, uint64(mb.index))
  7623  		buf = binary.AppendUvarint(buf, mb.bytes)
  7624  		buf = binary.AppendUvarint(buf, atomic.LoadUint64(&mb.first.seq))
  7625  		buf = binary.AppendVarint(buf, mb.first.ts-baseTime)
  7626  		buf = binary.AppendUvarint(buf, atomic.LoadUint64(&mb.last.seq))
  7627  		buf = binary.AppendVarint(buf, mb.last.ts-baseTime)
  7628  
  7629  		numDeleted := mb.dmap.Size()
  7630  		buf = binary.AppendUvarint(buf, uint64(numDeleted))
  7631  		if numDeleted > 0 {
  7632  			dmap, _ := mb.dmap.Encode(scratch[:0])
  7633  			dmapTotalLen += len(dmap)
  7634  			buf = append(buf, dmap...)
  7635  		}
  7636  		// If this is the last one grab the last checksum and the block index, e.g. 22.blk, 22 is the block index.
  7637  		// We use this to quickly open this file on recovery.
  7638  		if mb == fs.lmb {
  7639  			lbi = mb.index
  7640  			mb.ensureLastChecksumLoaded()
  7641  			copy(lchk[0:], mb.lchk[:])
  7642  		}
  7643  		updateTrackingState(&mstate, mb)
  7644  		mb.mu.RUnlock()
  7645  	}
  7646  	if dmapTotalLen > 0 {
  7647  		fs.adml = dmapTotalLen / len(fs.blks)
  7648  	}
  7649  
  7650  	// Place block index and hash onto the end.
  7651  	buf = binary.AppendUvarint(buf, uint64(lbi))
  7652  	buf = append(buf, lchk[:]...)
  7653  
  7654  	// Encrypt if needed.
  7655  	if fs.prf != nil {
  7656  		if err := fs.setupAEK(); err != nil {
  7657  			fs.mu.Unlock()
  7658  			return err
  7659  		}
  7660  		nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead())
  7661  		rand.Read(nonce)
  7662  		buf = fs.aek.Seal(nonce, nonce, buf, nil)
  7663  	}
  7664  
  7665  	fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
  7666  
  7667  	fs.hh.Reset()
  7668  	fs.hh.Write(buf)
  7669  	buf = fs.hh.Sum(buf)
  7670  
  7671  	// Snapshot prior dirty count.
  7672  	priorDirty := fs.dirty
  7673  
  7674  	// Check tracking state.
  7675  	statesEqual := trackingStatesEqual(&fs.state, &mstate)
  7676  	// Release lock.
  7677  	fs.mu.Unlock()
  7678  
  7679  	// Check consistency here.
  7680  	if !statesEqual {
  7681  		fs.warn("Stream state encountered internal inconsistency on write")
  7682  		// Rebuild our fs state from the mb state.
  7683  		fs.rebuildState(nil)
  7684  		// Make sure to reprocess.
  7685  		fs.kickFlushStateLoop()
  7686  		return errCorruptState
  7687  	}
  7688  
  7689  	if cap(buf) > sz {
  7690  		fs.debug("WriteFullState reallocated from %d to %d", sz, cap(buf))
  7691  	}
  7692  
  7693  	// Write to a tmp file and rename.
  7694  	const tmpPre = streamStreamStateFile + tsep
  7695  	f, err := os.CreateTemp(filepath.Join(fs.fcfg.StoreDir, msgDir), tmpPre)
  7696  	if err != nil {
  7697  		return err
  7698  	}
  7699  	tmpName := f.Name()
  7700  	defer os.Remove(tmpName)
  7701  	if _, err = f.Write(buf); err == nil && fs.fcfg.SyncAlways {
  7702  		f.Sync()
  7703  	}
  7704  	f.Close()
  7705  	if err != nil {
  7706  		return err
  7707  	}
  7708  
  7709  	// Rename into position under our lock, clear prior dirty pending on success.
  7710  	fs.mu.Lock()
  7711  	if !fs.closed {
  7712  		if err := os.Rename(tmpName, fn); err != nil {
  7713  			fs.mu.Unlock()
  7714  			return err
  7715  		}
  7716  		fs.dirty -= priorDirty
  7717  	}
  7718  	fs.mu.Unlock()
  7719  
  7720  	return nil
  7721  }
  7722  
  7723  // Stop the current filestore.
  7724  func (fs *fileStore) Stop() error {
  7725  	return fs.stop(true)
  7726  }
  7727  
  7728  // Stop the current filestore.
  7729  func (fs *fileStore) stop(writeState bool) error {
  7730  	fs.mu.Lock()
  7731  	if fs.closed || fs.closing {
  7732  		fs.mu.Unlock()
  7733  		return ErrStoreClosed
  7734  	}
  7735  
  7736  	// Mark as closing. Do before releasing the lock to writeFullState
  7737  	// so we don't end up with this function running more than once.
  7738  	fs.closing = true
  7739  
  7740  	if writeState {
  7741  		fs.checkAndFlushAllBlocks()
  7742  	}
  7743  	fs.closeAllMsgBlocks(false)
  7744  
  7745  	fs.cancelSyncTimer()
  7746  	fs.cancelAgeChk()
  7747  
  7748  	// Release the state flusher loop.
  7749  	if fs.qch != nil {
  7750  		close(fs.qch)
  7751  		fs.qch = nil
  7752  	}
  7753  
  7754  	if writeState {
  7755  		// Wait for the state flush loop to exit.
  7756  		fsld := fs.fsld
  7757  		fs.mu.Unlock()
  7758  		<-fsld
  7759  		// Write full state if needed. If not dirty this is a no-op.
  7760  		fs.writeFullState()
  7761  		fs.mu.Lock()
  7762  	}
  7763  
  7764  	// Mark as closed. Last message block needs to be cleared after
  7765  	// writeFullState has completed.
  7766  	fs.closed = true
  7767  	fs.lmb = nil
  7768  
  7769  	// We should update the upper usage layer on a stop.
  7770  	cb, bytes := fs.scb, int64(fs.state.Bytes)
  7771  	fs.mu.Unlock()
  7772  
  7773  	fs.cmu.Lock()
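        	// Copy the consumer stores (stack-backed array for the common case) so we can stop them below without holding the consumer lock.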
  7774  	var _cfs [256]ConsumerStore
  7775  	cfs := append(_cfs[:0], fs.cfs...)
  7776  	fs.cfs = nil
  7777  	fs.cmu.Unlock()
  7778  
  7779  	for _, o := range cfs {
  7780  		o.Stop()
  7781  	}
  7782  
  7783  	if bytes > 0 && cb != nil {
  7784  		cb(0, -bytes, 0, _EMPTY_)
  7785  	}
  7786  
  7787  	return nil
  7788  }
  7789  
  7790  const errFile = "errors.txt"
  7791  
  7792  // Stream our snapshot through S2 compression and tar.
  7793  func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includeConsumers bool) {
  7794  	defer w.Close()
  7795  
  7796  	enc := s2.NewWriter(w)
  7797  	defer enc.Close()
  7798  
  7799  	tw := tar.NewWriter(enc)
  7800  	defer tw.Close()
  7801  
  7802  	defer func() {
  7803  		fs.mu.Lock()
  7804  		fs.sips--
  7805  		fs.mu.Unlock()
  7806  	}()
  7807  
  7808  	modTime := time.Now().UTC()
  7809  
  7810  	writeFile := func(name string, buf []byte) error {
  7811  		hdr := &tar.Header{
  7812  			Name:    name,
  7813  			Mode:    0600,
  7814  			ModTime: modTime,
  7815  			Uname:   "nats",
  7816  			Gname:   "nats",
  7817  			Size:    int64(len(buf)),
  7818  			Format:  tar.FormatPAX,
  7819  		}
  7820  		if err := tw.WriteHeader(hdr); err != nil {
  7821  			return err
  7822  		}
  7823  		if _, err := tw.Write(buf); err != nil {
  7824  			return err
  7825  		}
  7826  		return nil
  7827  	}
  7828  
  7829  	writeErr := func(err string) {
  7830  		writeFile(errFile, []byte(err))
  7831  	}
  7832  
  7833  	fs.mu.Lock()
  7834  	blks := fs.blks
  7835  	// Grab our general meta data.
  7836  	// We do this now instead of pulling from files since they could be encrypted.
  7837  	meta, err := json.Marshal(fs.cfg)
  7838  	if err != nil {
  7839  		fs.mu.Unlock()
  7840  		writeErr(fmt.Sprintf("Could not gather stream meta file: %v", err))
  7841  		return
  7842  	}
  7843  	hh := fs.hh
  7844  	hh.Reset()
  7845  	hh.Write(meta)
  7846  	sum := []byte(hex.EncodeToString(fs.hh.Sum(nil)))
  7847  	fs.mu.Unlock()
  7848  
  7849  	// Meta first.
  7850  	if writeFile(JetStreamMetaFile, meta) != nil {
  7851  		return
  7852  	}
  7853  	if writeFile(JetStreamMetaFileSum, sum) != nil {
  7854  		return
  7855  	}
  7856  
  7857  	// Can't use filepath.Join here, tar only recognizes relative paths with forward slashes.
  7858  	msgPre := msgDir + "/"
  7859  	var bbuf []byte
  7860  
  7861  	const minLen = 32
  7862  	sfn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)
  7863  	if buf, err := os.ReadFile(sfn); err == nil && len(buf) >= minLen {
  7864  		if fs.aek != nil {
  7865  			ns := fs.aek.NonceSize()
  7866  			buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:len(buf)-highwayhash.Size64], nil)
  7867  			if err == nil {
  7868  				// Redo hash checksum at end on plaintext.
  7869  				fs.mu.Lock()
  7870  				hh.Reset()
  7871  				hh.Write(buf)
  7872  				buf = fs.hh.Sum(buf)
  7873  				fs.mu.Unlock()
  7874  			}
  7875  		}
  7876  		if err == nil && writeFile(msgPre+streamStreamStateFile, buf) != nil {
  7877  			return
  7878  		}
  7879  	}
  7880  
  7881  	// Now do messages themselves.
  7882  	for _, mb := range blks {
  7883  		if mb.pendingWriteSize() > 0 {
  7884  			mb.flushPendingMsgs()
  7885  		}
  7886  		mb.mu.Lock()
  7887  		// We could stream but don't want to hold the lock and prevent changes, so just read in and
  7888  		// release the lock for now.
  7889  		bbuf, err = mb.loadBlock(bbuf)
  7890  		if err != nil {
  7891  			mb.mu.Unlock()
  7892  			writeErr(fmt.Sprintf("Could not read message block [%d]: %v", mb.index, err))
  7893  			return
  7894  		}
  7895  		// Check for encryption.
  7896  		if mb.bek != nil && len(bbuf) > 0 {
  7897  			rbek, err := genBlockEncryptionKey(fs.fcfg.Cipher, mb.seed, mb.nonce)
  7898  			if err != nil {
  7899  				mb.mu.Unlock()
  7900  				writeErr(fmt.Sprintf("Could not create encryption key for message block [%d]: %v", mb.index, err))
  7901  				return
  7902  			}
  7903  			rbek.XORKeyStream(bbuf, bbuf)
  7904  		}
  7905  		// Check for compression.
  7906  		if bbuf, err = mb.decompressIfNeeded(bbuf); err != nil {
  7907  			mb.mu.Unlock()
  7908  			writeErr(fmt.Sprintf("Could not decompress message block [%d]: %v", mb.index, err))
  7909  			return
  7910  		}
  7911  		mb.mu.Unlock()
  7912  
  7913  		// Do this one unlocked.
  7914  		if writeFile(msgPre+fmt.Sprintf(blkScan, mb.index), bbuf) != nil {
  7915  			return
  7916  		}
  7917  	}
  7918  
  7919  	// Bail if no consumers requested.
  7920  	if !includeConsumers {
  7921  		return
  7922  	}
  7923  
  7924  	// Do consumers' state last.
  7925  	fs.cmu.RLock()
  7926  	cfs := fs.cfs
  7927  	fs.cmu.RUnlock()
  7928  
  7929  	for _, cs := range cfs {
  7930  		o, ok := cs.(*consumerFileStore)
  7931  		if !ok {
  7932  			continue
  7933  		}
  7934  		o.mu.Lock()
  7935  		// Grab our general meta data.
  7936  		// We do this now instead of pulling from files since they could be encrypted.
  7937  		meta, err := json.Marshal(o.cfg)
  7938  		if err != nil {
  7939  			o.mu.Unlock()
  7940  			writeErr(fmt.Sprintf("Could not gather consumer meta file for %q: %v", o.name, err))
  7941  			return
  7942  		}
  7943  		o.hh.Reset()
  7944  		o.hh.Write(meta)
  7945  		sum := []byte(hex.EncodeToString(o.hh.Sum(nil)))
  7946  
  7947  		// We can have the running state directly encoded now.
  7948  		state, err := o.encodeState()
  7949  		if err != nil {
  7950  			o.mu.Unlock()
  7951  			writeErr(fmt.Sprintf("Could not encode consumer state for %q: %v", o.name, err))
  7952  			return
  7953  		}
  7954  		odirPre := filepath.Join(consumerDir, o.name)
  7955  		o.mu.Unlock()
  7956  
  7957  		// Write all the consumer files.
  7958  		if writeFile(filepath.Join(odirPre, JetStreamMetaFile), meta) != nil {
  7959  			return
  7960  		}
  7961  		if writeFile(filepath.Join(odirPre, JetStreamMetaFileSum), sum) != nil {
  7962  			return
  7963  		}
  7964  		writeFile(filepath.Join(odirPre, consumerState), state)
  7965  	}
  7966  }
  7967  
  7968  // Create a snapshot of this stream and its consumer's state along with messages.
  7969  func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumers bool) (*SnapshotResult, error) {
  7970  	fs.mu.Lock()
  7971  	if fs.closed {
  7972  		fs.mu.Unlock()
  7973  		return nil, ErrStoreClosed
  7974  	}
  7975  	// Only allow one at a time.
  7976  	if fs.sips > 0 {
  7977  		fs.mu.Unlock()
  7978  		return nil, ErrStoreSnapshotInProgress
  7979  	}
  7980  	// Mark us as snapshotting
  7981  	fs.sips += 1
  7982  	fs.mu.Unlock()
  7983  
  7984  	if checkMsgs {
  7985  		ld := fs.checkMsgs()
  7986  		if ld != nil && len(ld.Msgs) > 0 {
  7987  			return nil, fmt.Errorf("snapshot check detected %d bad messages", len(ld.Msgs))
  7988  		}
  7989  	}
  7990  
  7991  	// Write out full state as well before proceeding.
  7992  	fs.writeFullState()
  7993  
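        	// Use an in-memory full-duplex pipe: the snapshot Go routine below writes the S2-compressed tar stream into pw while the caller reads from pr via the returned SnapshotResult.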
  7994  	pr, pw := net.Pipe()
  7995  
  7996  	// Set a write deadline here to protect ourselves.
  7997  	if deadline > 0 {
  7998  		pw.SetWriteDeadline(time.Now().Add(deadline))
  7999  	}
  8000  
  8001  	// We can add to our stream while snapshotting but not "user" delete anything.
  8002  	var state StreamState
  8003  	fs.FastState(&state)
  8004  
  8005  	// Stream in separate Go routine.
  8006  	go fs.streamSnapshot(pw, &state, includeConsumers)
  8007  
  8008  	return &SnapshotResult{pr, state}, nil
  8009  }
  8010  
  8011  // Helper to return the config.
  8012  func (fs *fileStore) fileStoreConfig() FileStoreConfig {
  8013  	fs.mu.RLock()
  8014  	defer fs.mu.RUnlock()
  8015  	return fs.fcfg
  8016  }
  8017  
  8018  // Read lock all existing message blocks.
  8019  // Lock held on entry.
  8020  func (fs *fileStore) readLockAllMsgBlocks() {
  8021  	for _, mb := range fs.blks {
  8022  		mb.mu.RLock()
  8023  	}
  8024  }
  8025  
  8026  // Read unlock all existing message blocks.
  8027  // Lock held on entry.
  8028  func (fs *fileStore) readUnlockAllMsgBlocks() {
  8029  	for _, mb := range fs.blks {
  8030  		mb.mu.RUnlock()
  8031  	}
  8032  }
  8033  
  8034  // Binary encoded state snapshot, >= v2.10 server.
  8035  func (fs *fileStore) EncodedStreamState(failed uint64) ([]byte, error) {
  8036  	fs.mu.RLock()
  8037  	defer fs.mu.RUnlock()
  8038  
  8039  	// Calculate deleted.
  8040  	var numDeleted int64
  8041  	if fs.state.LastSeq > fs.state.FirstSeq {
  8042  		numDeleted = int64(fs.state.LastSeq-fs.state.FirstSeq+1) - int64(fs.state.Msgs)
  8043  		if numDeleted < 0 {
  8044  			numDeleted = 0
  8045  		}
  8046  	}
  8047  
  8048  	// Encoded is Msgs, Bytes, FirstSeq, LastSeq, Failed, NumDeleted and optional DeletedBlocks
  8049  	var buf [1024]byte
  8050  	buf[0], buf[1] = streamStateMagic, streamStateVersion
  8051  	n := hdrLen
  8052  	n += binary.PutUvarint(buf[n:], fs.state.Msgs)
  8053  	n += binary.PutUvarint(buf[n:], fs.state.Bytes)
  8054  	n += binary.PutUvarint(buf[n:], fs.state.FirstSeq)
  8055  	n += binary.PutUvarint(buf[n:], fs.state.LastSeq)
  8056  	n += binary.PutUvarint(buf[n:], failed)
  8057  	n += binary.PutUvarint(buf[n:], uint64(numDeleted))
  8058  
  8059  	b := buf[0:n]
  8060  
  8061  	if numDeleted > 0 {
  8062  		var scratch [4 * 1024]byte
  8063  
  8064  		fs.readLockAllMsgBlocks()
  8065  		defer fs.readUnlockAllMsgBlocks()
  8066  
  8067  		for _, db := range fs.deleteBlocks() {
  8068  			switch db := db.(type) {
  8069  			case *DeleteRange:
  8070  				first, _, num := db.State()
  8071  				scratch[0] = runLengthMagic
  8072  				i := 1
  8073  				i += binary.PutUvarint(scratch[i:], first)
  8074  				i += binary.PutUvarint(scratch[i:], num)
  8075  				b = append(b, scratch[0:i]...)
  8076  			case *avl.SequenceSet:
  8077  				buf, err := db.Encode(scratch[:0])
  8078  				if err != nil {
  8079  					return nil, err
  8080  				}
  8081  				b = append(b, buf...)
  8082  			default:
  8083  				return nil, errors.New("no impl")
  8084  			}
  8085  		}
  8086  	}
  8087  
  8088  	return b, nil
  8089  }
  8090  
  8091  // We used to be more sophisticated to save memory, but speed is more important.
  8092  // All blocks should be at least read locked.
  8093  func (fs *fileStore) deleteBlocks() DeleteBlocks {
  8094  	var dbs DeleteBlocks
  8095  	var prevLast uint64
  8096  
  8097  	for _, mb := range fs.blks {
  8098  		// Detect if we have a gap between these blocks.
  8099  		fseq := atomic.LoadUint64(&mb.first.seq)
  8100  		if prevLast > 0 && prevLast+1 != fseq {
  8101  			dbs = append(dbs, &DeleteRange{First: prevLast + 1, Num: fseq - prevLast - 1})
  8102  		}
  8103  		if mb.dmap.Size() > 0 {
  8104  			dbs = append(dbs, &mb.dmap)
  8105  		}
  8106  		prevLast = atomic.LoadUint64(&mb.last.seq)
  8107  	}
  8108  	return dbs
  8109  }
  8110  
  8111  // SyncDeleted will make sure this stream has same deleted state as dbs.
  8112  func (fs *fileStore) SyncDeleted(dbs DeleteBlocks) {
  8113  	if len(dbs) == 0 {
  8114  		return
  8115  	}
  8116  
  8117  	fs.mu.Lock()
  8118  	defer fs.mu.Unlock()
  8119  
  8120  	var needsCheck DeleteBlocks
  8121  
  8122  	fs.readLockAllMsgBlocks()
  8123  	mdbs := fs.deleteBlocks()
  8124  	for i, db := range dbs {
  8125  		// If the block is the same as what we have we can skip it.
  8126  		if i < len(mdbs) {
  8127  			first, last, num := db.State()
  8128  			eFirst, eLast, eNum := mdbs[i].State()
  8129  			if first == eFirst && last == eLast && num == eNum {
  8130  				continue
  8131  			}
  8132  		}
  8133  		// Need to insert these.
  8134  		needsCheck = append(needsCheck, db)
  8135  	}
  8136  	fs.readUnlockAllMsgBlocks()
  8137  
  8138  	for _, db := range needsCheck {
  8139  		db.Range(func(dseq uint64) bool {
  8140  			fs.removeMsg(dseq, false, true, false)
  8141  			return true
  8142  		})
  8143  	}
  8144  }
  8145  
  8146  ////////////////////////////////////////////////////////////////////////////////
  8147  // Consumers
  8148  ////////////////////////////////////////////////////////////////////////////////
  8149  
  8150  type consumerFileStore struct {
  8151  	mu      sync.Mutex
  8152  	fs      *fileStore
  8153  	cfg     *FileConsumerInfo
  8154  	prf     keyGen
  8155  	aek     cipher.AEAD
  8156  	name    string
  8157  	odir    string
  8158  	ifn     string
  8159  	hh      hash.Hash64
  8160  	state   ConsumerState
  8161  	fch     chan struct{}
  8162  	qch     chan struct{}
  8163  	flusher bool
  8164  	writing bool
  8165  	dirty   bool
  8166  	closed  bool
  8167  }
  8168  
  8169  func (fs *fileStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerStore, error) {
  8170  	if fs == nil {
  8171  		return nil, fmt.Errorf("filestore is nil")
  8172  	}
  8173  	if fs.isClosed() {
  8174  		return nil, ErrStoreClosed
  8175  	}
  8176  	if cfg == nil || name == _EMPTY_ {
  8177  		return nil, fmt.Errorf("bad consumer config")
  8178  	}
  8179  
  8180  	// We now allow a consumer of a filestore-backed stream to be overridden to use memory storage.
  8181  	if cfg.MemoryStorage {
  8182  		// Create directly here.
  8183  		o := &consumerMemStore{ms: fs, cfg: *cfg}
  8184  		fs.AddConsumer(o)
  8185  		return o, nil
  8186  	}
  8187  
  8188  	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, name)
  8189  	if err := os.MkdirAll(odir, defaultDirPerms); err != nil {
  8190  		return nil, fmt.Errorf("could not create consumer directory - %v", err)
  8191  	}
  8192  	csi := &FileConsumerInfo{Name: name, Created: time.Now().UTC(), ConsumerConfig: *cfg}
  8193  	o := &consumerFileStore{
  8194  		fs:   fs,
  8195  		cfg:  csi,
  8196  		prf:  fs.prf,
  8197  		name: name,
  8198  		odir: odir,
  8199  		ifn:  filepath.Join(odir, consumerState),
  8200  	}
  8201  	key := sha256.Sum256([]byte(fs.cfg.Name + "/" + name))
  8202  	hh, err := highwayhash.New64(key[:])
  8203  	if err != nil {
  8204  		return nil, fmt.Errorf("could not create hash: %v", err)
  8205  	}
  8206  	o.hh = hh
  8207  
  8208  	// Check for encryption.
  8209  	if o.prf != nil {
  8210  		if ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey)); err == nil {
  8211  			if len(ekey) < minBlkKeySize {
  8212  				return nil, errBadKeySize
  8213  			}
  8214  			// Recover key encryption key.
  8215  			rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
  8216  			if err != nil {
  8217  				return nil, err
  8218  			}
  8219  
  8220  			sc := fs.fcfg.Cipher
  8221  			kek, err := genEncryptionKey(sc, rb)
  8222  			if err != nil {
  8223  				return nil, err
  8224  			}
  8225  			ns := kek.NonceSize()
  8226  			nonce := ekey[:ns]
  8227  			seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
  8228  			if err != nil {
  8229  				// We may be here on a cipher conversion, so attempt to convert.
  8230  				if err = o.convertCipher(); err != nil {
  8231  					return nil, err
  8232  				}
  8233  			} else {
  8234  				o.aek, err = genEncryptionKey(sc, seed)
  8235  			}
  8236  			if err != nil {
  8237  				return nil, err
  8238  			}
  8239  		}
  8240  	}
  8241  
  8242  	// Track if we are creating the directory so that we can clean up if we encounter an error.
  8243  	var didCreate bool
  8244  
  8245  	// Write our meta data if it does not exist.
  8246  	meta := filepath.Join(odir, JetStreamMetaFile)
  8247  	if _, err := os.Stat(meta); err != nil && os.IsNotExist(err) {
  8248  		didCreate = true
  8249  		csi.Created = time.Now().UTC()
  8250  		if err := o.writeConsumerMeta(); err != nil {
  8251  			os.RemoveAll(odir)
  8252  			return nil, err
  8253  		}
  8254  	}
  8255  
  8256  	// If we expect to be encrypted check that what we are restoring is not plaintext.
  8257  	// This can happen on snapshot restores or conversions.
  8258  	if o.prf != nil {
  8259  		keyFile := filepath.Join(odir, JetStreamMetaFileKey)
  8260  		if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) {
  8261  			if err := o.writeConsumerMeta(); err != nil {
  8262  				if didCreate {
  8263  					os.RemoveAll(odir)
  8264  				}
  8265  				return nil, err
  8266  			}
  8267  			// Redo the state file as well here if we have one and we can tell it was plaintext.
  8268  			if buf, err := os.ReadFile(o.ifn); err == nil {
  8269  				if _, err := decodeConsumerState(buf); err == nil {
  8270  					<-dios
  8271  					err := os.WriteFile(o.ifn, o.encryptState(buf), defaultFilePerms)
  8272  					dios <- struct{}{}
  8273  					if err != nil {
  8274  						if didCreate {
  8275  							os.RemoveAll(odir)
  8276  						}
  8277  						return nil, err
  8278  					}
  8279  				}
  8280  			}
  8281  		}
  8282  	}
  8283  
  8284  	// Create channels to control our flush go routine.
  8285  	o.fch = make(chan struct{}, 1)
  8286  	o.qch = make(chan struct{})
  8287  	go o.flushLoop(o.fch, o.qch)
  8288  
  8289  	// Make sure to load in our state from disk if needed.
  8290  	o.loadState()
  8291  
  8292  	// Assign to filestore.
  8293  	fs.AddConsumer(o)
  8294  
  8295  	return o, nil
  8296  }
  8297  
  8298  func (o *consumerFileStore) convertCipher() error {
  8299  	fs := o.fs
  8300  	odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, o.name)
  8301  
  8302  	ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey))
  8303  	if err != nil {
  8304  		return err
  8305  	}
  8306  	if len(ekey) < minBlkKeySize {
  8307  		return errBadKeySize
  8308  	}
  8309  	// Recover key encryption key.
  8310  	rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name))
  8311  	if err != nil {
  8312  		return err
  8313  	}
  8314  
  8315  	// Do these in reverse since we are converting.
  8316  	sc := fs.fcfg.Cipher
  8317  	osc := AES
  8318  	if sc == AES {
  8319  		osc = ChaCha
  8320  	}
  8321  	kek, err := genEncryptionKey(osc, rb)
  8322  	if err != nil {
  8323  		return err
  8324  	}
  8325  	ns := kek.NonceSize()
  8326  	nonce := ekey[:ns]
  8327  	seed, err := kek.Open(nil, nonce, ekey[ns:], nil)
  8328  	if err != nil {
  8329  		return err
  8330  	}
  8331  	aek, err := genEncryptionKey(osc, seed)
  8332  	if err != nil {
  8333  		return err
  8334  	}
  8335  	// Now read in and decode our state using the old cipher.
  8336  	buf, err := os.ReadFile(o.ifn)
  8337  	if err != nil {
  8338  		return err
  8339  	}
  8340  	buf, err = aek.Open(nil, buf[:ns], buf[ns:], nil)
  8341  	if err != nil {
  8342  		return err
  8343  	}
  8344  
  8345  	// Since we are here we recovered our old state.
  8346  	// Now write our meta, which will generate the new keys with the new cipher.
  8347  	if err := o.writeConsumerMeta(); err != nil {
  8348  		return err
  8349  	}
  8350  
  8351  	// Now write out our state with the new cipher.
  8352  	return o.writeState(buf)
  8353  }
  8354  
  8355  // Kick flusher for this consumer.
  8356  // Lock should be held.
  8357  func (o *consumerFileStore) kickFlusher() {
  8358  	if o.fch != nil {
  8359  		select {
  8360  		case o.fch <- struct{}{}:
  8361  		default:
  8362  		}
  8363  	}
  8364  	o.dirty = true
  8365  }
  8366  
  8367  // Set in flusher status
  8368  func (o *consumerFileStore) setInFlusher() {
  8369  	o.mu.Lock()
  8370  	o.flusher = true
  8371  	o.mu.Unlock()
  8372  }
  8373  
  8374  // Clear in flusher status
  8375  func (o *consumerFileStore) clearInFlusher() {
  8376  	o.mu.Lock()
  8377  	o.flusher = false
  8378  	o.mu.Unlock()
  8379  }
  8380  
  8381  // Report in flusher status
  8382  func (o *consumerFileStore) inFlusher() bool {
  8383  	o.mu.Lock()
  8384  	defer o.mu.Unlock()
  8385  	return o.flusher
  8386  }
  8387  
  8388  // flushLoop watches for consumer updates and the quit channel.
  8389  func (o *consumerFileStore) flushLoop(fch, qch chan struct{}) {
  8390  
  8391  	o.setInFlusher()
  8392  	defer o.clearInFlusher()
  8393  
  8394  	// Maintain approximately 10 updates per second per consumer under load.
  8395  	const minTime = 100 * time.Millisecond
  8396  	var lastWrite time.Time
  8397  	var dt *time.Timer
  8398  
  8399  	setDelayTimer := func(addWait time.Duration) {
  8400  		if dt == nil {
  8401  			dt = time.NewTimer(addWait)
  8402  			return
  8403  		}
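        		// Standard timer reuse idiom: stop and drain any pending fire before Reset to avoid a stale tick.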
  8404  		if !dt.Stop() {
  8405  			select {
  8406  			case <-dt.C:
  8407  			default:
  8408  			}
  8409  		}
  8410  		dt.Reset(addWait)
  8411  	}
  8412  
  8413  	for {
  8414  		select {
  8415  		case <-fch:
  8416  			if ts := time.Since(lastWrite); ts < minTime {
  8417  				setDelayTimer(minTime - ts)
  8418  				select {
  8419  				case <-dt.C:
  8420  				case <-qch:
  8421  					return
  8422  				}
  8423  			}
  8424  			o.mu.Lock()
  8425  			if o.closed {
  8426  				o.mu.Unlock()
  8427  				return
  8428  			}
  8429  			buf, err := o.encodeState()
  8430  			o.mu.Unlock()
  8431  			if err != nil {
  8432  				return
  8433  			}
  8434  			// TODO(dlc) - if we error should start failing upwards.
  8435  			if err := o.writeState(buf); err == nil {
  8436  				lastWrite = time.Now()
  8437  			}
  8438  		case <-qch:
  8439  			return
  8440  		}
  8441  	}
  8442  }
  8443  
  8444  // SetStarting sets our starting stream sequence.
  8445  func (o *consumerFileStore) SetStarting(sseq uint64) error {
  8446  	o.mu.Lock()
  8447  	o.state.Delivered.Stream = sseq
  8448  	buf, err := o.encodeState()
  8449  	o.mu.Unlock()
  8450  	if err != nil {
  8451  		return err
  8452  	}
  8453  	return o.writeState(buf)
  8454  }
  8455  
  8456  // HasState returns if this store has a recorded state.
  8457  func (o *consumerFileStore) HasState() bool {
  8458  	o.mu.Lock()
  8459  	_, err := os.Stat(o.ifn)
  8460  	o.mu.Unlock()
  8461  	return err == nil
  8462  }
  8463  
  8464  // UpdateDelivered is called whenever a new message has been delivered.
  8465  func (o *consumerFileStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) error {
  8466  	o.mu.Lock()
  8467  	defer o.mu.Unlock()
  8468  
  8469  	if dc != 1 && o.cfg.AckPolicy == AckNone {
  8470  		return ErrNoAckPolicy
  8471  	}
  8472  
  8473  	// On restarts the old leader may get a replay of old entries from the raft logs.
  8474  	if dseq <= o.state.AckFloor.Consumer {
  8475  		return nil
  8476  	}
  8477  
  8478  	// See if we expect an ack for this.
  8479  	if o.cfg.AckPolicy != AckNone {
  8480  		// Need to create pending records here.
  8481  		if o.state.Pending == nil {
  8482  			o.state.Pending = make(map[uint64]*Pending)
  8483  		}
  8484  		var p *Pending
  8485  		// Check for an update to a message already delivered.
  8486  		if sseq <= o.state.Delivered.Stream {
  8487  			if p = o.state.Pending[sseq]; p != nil {
  8488  				p.Sequence, p.Timestamp = dseq, ts
  8489  			}
  8490  		} else {
  8491  			// Add to pending.
  8492  			o.state.Pending[sseq] = &Pending{dseq, ts}
  8493  		}
  8494  		// Update delivered as needed.
  8495  		if dseq > o.state.Delivered.Consumer {
  8496  			o.state.Delivered.Consumer = dseq
  8497  		}
  8498  		if sseq > o.state.Delivered.Stream {
  8499  			o.state.Delivered.Stream = sseq
  8500  		}
  8501  
  8502  		if dc > 1 {
  8503  			if maxdc := uint64(o.cfg.MaxDeliver); maxdc > 0 && dc > maxdc {
  8504  				// Make sure to remove from pending.
  8505  				delete(o.state.Pending, sseq)
  8506  			}
  8507  			if o.state.Redelivered == nil {
  8508  				o.state.Redelivered = make(map[uint64]uint64)
  8509  			}
  8510  			// Only update if greater than what we already have.
  8511  			if o.state.Redelivered[sseq] < dc-1 {
  8512  				o.state.Redelivered[sseq] = dc - 1
  8513  			}
  8514  		}
  8515  	} else {
  8516  		// For AckNone just update delivered and ackfloor at the same time.
  8517  		if dseq > o.state.Delivered.Consumer {
  8518  			o.state.Delivered.Consumer = dseq
  8519  			o.state.AckFloor.Consumer = dseq
  8520  		}
  8521  		if sseq > o.state.Delivered.Stream {
  8522  			o.state.Delivered.Stream = sseq
  8523  			o.state.AckFloor.Stream = sseq
  8524  		}
  8525  	}
  8526  	// Make sure we flush to disk.
  8527  	o.kickFlusher()
  8528  
  8529  	return nil
  8530  }
  8531  
  8532  // UpdateAcks is called whenever a consumer with explicit ack or ack all acks a message.
  8533  func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error {
  8534  	o.mu.Lock()
  8535  	defer o.mu.Unlock()
  8536  
  8537  	if o.cfg.AckPolicy == AckNone {
  8538  		return ErrNoAckPolicy
  8539  	}
  8540  
  8541  	// On restarts the old leader may get a replay of old entries from the raft log.
  8542  	if dseq <= o.state.AckFloor.Consumer {
  8543  		return nil
  8544  	}
  8545  
  8546  	if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
  8547  		return ErrStoreMsgNotFound
  8548  	}
  8549  
  8550  	// Check for AckAll here.
  8551  	if o.cfg.AckPolicy == AckAll {
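        		// AckAll acknowledges everything up to and including sseq, so advance both
        		// floors and drop every pending/redelivered entry in that range.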
  8552  		sgap := sseq - o.state.AckFloor.Stream
  8553  		o.state.AckFloor.Consumer = dseq
  8554  		o.state.AckFloor.Stream = sseq
  8555  		for seq := sseq; seq > sseq-sgap; seq-- {
  8556  			delete(o.state.Pending, seq)
  8557  			if len(o.state.Redelivered) > 0 {
  8558  				delete(o.state.Redelivered, seq)
  8559  			}
  8560  		}
  8561  		o.kickFlusher()
  8562  		return nil
  8563  	}
  8564  
  8565  	// AckExplicit
  8566  
  8567  	// First delete from our pending state.
  8568  	if p, ok := o.state.Pending[sseq]; ok {
  8569  		delete(o.state.Pending, sseq)
  8570  		dseq = p.Sequence // Use the original.
  8571  	}
  8572  	if len(o.state.Pending) == 0 {
  8573  		o.state.AckFloor.Consumer = o.state.Delivered.Consumer
  8574  		o.state.AckFloor.Stream = o.state.Delivered.Stream
  8575  	} else if dseq == o.state.AckFloor.Consumer+1 {
  8576  		o.state.AckFloor.Consumer = dseq
  8577  		o.state.AckFloor.Stream = sseq
  8578  
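        		// If more messages have been delivered, scan forward to the next message
        		// still pending and move the ack floor up to just below it.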
  8579  		if o.state.Delivered.Consumer > dseq {
  8580  			for ss := sseq + 1; ss <= o.state.Delivered.Stream; ss++ {
  8581  				if p, ok := o.state.Pending[ss]; ok {
  8582  					if p.Sequence > 0 {
  8583  						o.state.AckFloor.Consumer = p.Sequence - 1
  8584  						o.state.AckFloor.Stream = ss - 1
  8585  					}
  8586  					break
  8587  				}
  8588  			}
  8589  		}
  8590  	}
  8591  	// We do these regardless.
  8592  	delete(o.state.Redelivered, sseq)
  8593  
  8594  	o.kickFlusher()
  8595  	return nil
  8596  }
  8597  
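        // Worst-case size of the fixed portion of an encoded consumer state:
        // the header (magic + version) plus up to six max-length uvarints.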
  8598  const seqsHdrSize = 6*binary.MaxVarintLen64 + hdrLen
  8599  
  8600  // EncodedState returns our encoded consumer state, version 2.
  8601  func (o *consumerFileStore) EncodedState() ([]byte, error) {
  8602  	o.mu.Lock()
  8603  	defer o.mu.Unlock()
  8604  	return o.encodeState()
  8605  }
  8606  
  8607  // Encode our consumer state, version 2.
  8608  // Lock should be held.
  8609  func (o *consumerFileStore) encodeState() ([]byte, error) {
  8610  	// Grab reference to state, but make sure we load in if needed, so do not reference o.state directly.
  8611  	state, err := o.stateWithCopyLocked(false)
  8612  	if err != nil {
  8613  		return nil, err
  8614  	}
  8615  	return encodeConsumerState(state), nil
  8616  }
  8617  
  8618  func (o *consumerFileStore) UpdateConfig(cfg *ConsumerConfig) error {
  8619  	o.mu.Lock()
  8620  	defer o.mu.Unlock()
  8621  
  8622  	// This is mostly unchecked here. We are assuming the upper layers have done sanity checking.
  8623  	csi := o.cfg
  8624  	csi.ConsumerConfig = *cfg
  8625  
  8626  	return o.writeConsumerMeta()
  8627  }
  8628  
  8629  func (o *consumerFileStore) Update(state *ConsumerState) error {
  8630  	o.mu.Lock()
  8631  	defer o.mu.Unlock()
  8632  
  8633  	// Check to see if this is an outdated update.
  8634  	if state.Delivered.Consumer < o.state.Delivered.Consumer || state.AckFloor.Stream < o.state.AckFloor.Stream {
  8635  		return nil
  8636  	}
  8637  
  8638  	// Sanity checks.
  8639  	if state.AckFloor.Consumer > state.Delivered.Consumer {
  8640  		return fmt.Errorf("bad ack floor for consumer")
  8641  	}
  8642  	if state.AckFloor.Stream > state.Delivered.Stream {
  8643  		return fmt.Errorf("bad ack floor for stream")
  8644  	}
  8645  
  8646  	// Copy to our state.
  8647  	var pending map[uint64]*Pending
  8648  	var redelivered map[uint64]uint64
  8649  	if len(state.Pending) > 0 {
  8650  		pending = make(map[uint64]*Pending, len(state.Pending))
  8651  		for seq, p := range state.Pending {
  8652  			pending[seq] = &Pending{p.Sequence, p.Timestamp}
  8653  			if seq <= state.AckFloor.Stream || seq > state.Delivered.Stream {
  8654  				return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq)
  8655  			}
  8656  		}
  8657  	}
  8658  	if len(state.Redelivered) > 0 {
  8659  		redelivered = make(map[uint64]uint64, len(state.Redelivered))
  8660  		for seq, dc := range state.Redelivered {
  8661  			redelivered[seq] = dc
  8662  		}
  8663  	}
  8664  
  8665  	o.state.Delivered = state.Delivered
  8666  	o.state.AckFloor = state.AckFloor
  8667  	o.state.Pending = pending
  8668  	o.state.Redelivered = redelivered
  8669  
  8670  	o.kickFlusher()
  8671  
  8672  	return nil
  8673  }
  8674  
  8675  // Will encrypt the state with our asset key. Will be a no-op if encryption is not enabled.
  8676  // Lock should be held.
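        // The random nonce is prepended to the ciphertext so the read path can split
        // it back out using the key's NonceSize.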
  8677  func (o *consumerFileStore) encryptState(buf []byte) []byte {
  8678  	if o.aek == nil {
  8679  		return buf
  8680  	}
  8681  	// TODO(dlc) - Optimize on space usage a bit?
  8682  	nonce := make([]byte, o.aek.NonceSize(), o.aek.NonceSize()+len(buf)+o.aek.Overhead())
  8683  	rand.Read(nonce)
  8684  	return o.aek.Seal(nonce, nonce, buf, nil)
  8685  }
  8686  
  8687  // Used to limit the number of disk IO calls in flight since they could all be blocking an OS thread.
  8688  // https://github.com/nats-io/nats-server/issues/2742
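        // Each blocking disk call takes a slot with <-dios and returns it with
        // dios <- struct{}{} once the call completes.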
  8689  var dios chan struct{}
  8690  
  8691  // Used to set up our simplistic counting semaphore using buffered channels.
  8692  // golang.org's semaphore seemed a bit heavy.
  8693  func init() {
  8694  	// Limit ourselves to a max of 4 blocking IO calls.
  8695  	const nIO = 4
  8696  	dios = make(chan struct{}, nIO)
  8697  	// Fill it up to start.
  8698  	for i := 0; i < nIO; i++ {
  8699  		dios <- struct{}{}
  8700  	}
  8701  }
  8702  
  8703  func (o *consumerFileStore) writeState(buf []byte) error {
  8704  	// Skip if a write is already in flight or there is nothing to write.
  8705  	o.mu.Lock()
  8706  	if o.writing || len(buf) == 0 {
  8707  		o.mu.Unlock()
  8708  		return nil
  8709  	}
  8710  
  8711  	// Check on encryption.
  8712  	if o.aek != nil {
  8713  		buf = o.encryptState(buf)
  8714  	}
  8715  
  8716  	o.writing = true
  8717  	o.dirty = false
  8718  	ifn := o.ifn
  8719  	o.mu.Unlock()
  8720  
  8721  	// Lock not held here but we do limit number of outstanding calls that could block OS threads.
  8722  	<-dios
  8723  	err := os.WriteFile(ifn, buf, defaultFilePerms)
  8724  	dios <- struct{}{}
  8725  
  8726  	o.mu.Lock()
  8727  	if err != nil {
  8728  		o.dirty = true
  8729  	}
  8730  	o.writing = false
  8731  	o.mu.Unlock()
  8732  
  8733  	return err
  8734  }
  8735  
  8736  // Will update the config. Only used when recovering ephemerals.
  8737  func (o *consumerFileStore) updateConfig(cfg ConsumerConfig) error {
  8738  	o.mu.Lock()
  8739  	defer o.mu.Unlock()
  8740  	o.cfg = &FileConsumerInfo{ConsumerConfig: cfg}
  8741  	return o.writeConsumerMeta()
  8742  }
  8743  
  8744  // Write out the consumer metadata, i.e. the consumer config.
  8745  // Lock should be held.
  8746  func (cfs *consumerFileStore) writeConsumerMeta() error {
  8747  	meta := filepath.Join(cfs.odir, JetStreamMetaFile)
  8748  	if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) {
  8749  		return err
  8750  	}
  8751  
  8752  	if cfs.prf != nil && cfs.aek == nil {
  8753  		fs := cfs.fs
  8754  		key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name + tsep + cfs.name)
  8755  		if err != nil {
  8756  			return err
  8757  		}
  8758  		cfs.aek = key
  8759  		keyFile := filepath.Join(cfs.odir, JetStreamMetaFileKey)
  8760  		if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) {
  8761  			return err
  8762  		}
  8763  		<-dios
  8764  		err = os.WriteFile(keyFile, encrypted, defaultFilePerms)
  8765  		dios <- struct{}{}
  8766  		if err != nil {
  8767  			return err
  8768  		}
  8769  	}
  8770  
  8771  	b, err := json.Marshal(cfs.cfg)
  8772  	if err != nil {
  8773  		return err
  8774  	}
  8775  	// Encrypt if needed.
  8776  	if cfs.aek != nil {
  8777  		nonce := make([]byte, cfs.aek.NonceSize(), cfs.aek.NonceSize()+len(b)+cfs.aek.Overhead())
  8778  		rand.Read(nonce)
  8779  		b = cfs.aek.Seal(nonce, nonce, b, nil)
  8780  	}
  8781  
  8782  	<-dios
  8783  	err = os.WriteFile(meta, b, defaultFilePerms)
  8784  	dios <- struct{}{}
  8785  	if err != nil {
  8786  		return err
  8787  	}
  8788  	cfs.hh.Reset()
  8789  	cfs.hh.Write(b)
  8790  	checksum := hex.EncodeToString(cfs.hh.Sum(nil))
  8791  	sum := filepath.Join(cfs.odir, JetStreamMetaFileSum)
  8792  
  8793  	<-dios
  8794  	err = os.WriteFile(sum, []byte(checksum), defaultFilePerms)
  8795  	dios <- struct{}{}
  8796  	if err != nil {
  8797  		return err
  8798  	}
  8799  	return nil
  8800  }
  8801  
  8802  // Consumer version.
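        // checkConsumerHeader validates the magic byte and returns the encoded
        // state version, currently 1 or 2.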
  8803  func checkConsumerHeader(hdr []byte) (uint8, error) {
  8804  	if len(hdr) < 2 || hdr[0] != magic {
  8805  		return 0, errCorruptState
  8806  	}
  8807  	version := hdr[1]
  8808  	switch version {
  8809  	case 1, 2:
  8810  		return version, nil
  8811  	}
  8812  	return 0, fmt.Errorf("unsupported version: %d", version)
  8813  }
  8814  
  8815  func (o *consumerFileStore) copyPending() map[uint64]*Pending {
  8816  	pending := make(map[uint64]*Pending, len(o.state.Pending))
  8817  	for seq, p := range o.state.Pending {
  8818  		pending[seq] = &Pending{p.Sequence, p.Timestamp}
  8819  	}
  8820  	return pending
  8821  }
  8822  
  8823  func (o *consumerFileStore) copyRedelivered() map[uint64]uint64 {
  8824  	redelivered := make(map[uint64]uint64, len(o.state.Redelivered))
  8825  	for seq, dc := range o.state.Redelivered {
  8826  		redelivered[seq] = dc
  8827  	}
  8828  	return redelivered
  8829  }
  8830  
  8831  // Type returns the type of the underlying store.
  8832  func (o *consumerFileStore) Type() StorageType { return FileStorage }
  8833  
  8834  // State retrieves the state from the state file.
  8835  // This is not expected to be called in high performance code, only on startup.
  8836  func (o *consumerFileStore) State() (*ConsumerState, error) {
  8837  	return o.stateWithCopy(true)
  8838  }
  8839  
  8840  // This will not copy pending or redelivered, so should only be done under the
  8841  // consumer owner's lock.
  8842  func (o *consumerFileStore) BorrowState() (*ConsumerState, error) {
  8843  	return o.stateWithCopy(false)
  8844  }
  8845  
  8846  func (o *consumerFileStore) stateWithCopy(doCopy bool) (*ConsumerState, error) {
  8847  	o.mu.Lock()
  8848  	defer o.mu.Unlock()
  8849  	return o.stateWithCopyLocked(doCopy)
  8850  }
  8851  
  8852  // Lock should be held.
  8853  func (o *consumerFileStore) stateWithCopyLocked(doCopy bool) (*ConsumerState, error) {
  8854  	if o.closed {
  8855  		return nil, ErrStoreClosed
  8856  	}
  8857  
  8858  	state := &ConsumerState{}
  8859  
  8860  	// See if we have a running state or if we need to read in from disk.
  8861  	if o.state.Delivered.Consumer != 0 || o.state.Delivered.Stream != 0 {
  8862  		state.Delivered = o.state.Delivered
  8863  		state.AckFloor = o.state.AckFloor
  8864  		if len(o.state.Pending) > 0 {
  8865  			if doCopy {
  8866  				state.Pending = o.copyPending()
  8867  			} else {
  8868  				state.Pending = o.state.Pending
  8869  			}
  8870  		}
  8871  		if len(o.state.Redelivered) > 0 {
  8872  			if doCopy {
  8873  				state.Redelivered = o.copyRedelivered()
  8874  			} else {
  8875  				state.Redelivered = o.state.Redelivered
  8876  			}
  8877  		}
  8878  		return state, nil
  8879  	}
  8880  
  8881  	// Read the state in from disk.
  8882  	<-dios
  8883  	buf, err := os.ReadFile(o.ifn)
  8884  	dios <- struct{}{}
  8885  
  8886  	if err != nil && !os.IsNotExist(err) {
  8887  		return nil, err
  8888  	}
  8889  
  8890  	if len(buf) == 0 {
  8891  		return state, nil
  8892  	}
  8893  
  8894  	// Check on encryption.
  8895  	if o.aek != nil {
  8896  		ns := o.aek.NonceSize()
  8897  		buf, err = o.aek.Open(nil, buf[:ns], buf[ns:], nil)
  8898  		if err != nil {
  8899  			return nil, err
  8900  		}
  8901  	}
  8902  
  8903  	state, err = decodeConsumerState(buf)
  8904  	if err != nil {
  8905  		return nil, err
  8906  	}
  8907  
  8908  	// Copy this state into our own.
  8909  	o.state.Delivered = state.Delivered
  8910  	o.state.AckFloor = state.AckFloor
  8911  	if len(state.Pending) > 0 {
  8912  		if doCopy {
  8913  			o.state.Pending = make(map[uint64]*Pending, len(state.Pending))
  8914  			for seq, p := range state.Pending {
  8915  				o.state.Pending[seq] = &Pending{p.Sequence, p.Timestamp}
  8916  			}
  8917  		} else {
  8918  			o.state.Pending = state.Pending
  8919  		}
  8920  	}
  8921  	if len(state.Redelivered) > 0 {
  8922  		if doCopy {
  8923  			o.state.Redelivered = make(map[uint64]uint64, len(state.Redelivered))
  8924  			for seq, dc := range state.Redelivered {
  8925  				o.state.Redelivered[seq] = dc
  8926  			}
  8927  		} else {
  8928  			o.state.Redelivered = state.Redelivered
  8929  		}
  8930  	}
  8931  
  8932  	return state, nil
  8933  }
  8934  
  8935  // Lock should be held. Called at startup.
  8936  func (o *consumerFileStore) loadState() {
  8937  	if _, err := os.Stat(o.ifn); err == nil {
  8938  		// This will load our state in from disk.
  8939  		o.stateWithCopyLocked(false)
  8940  	}
  8941  }
  8942  
  8943  // Decode consumer state.
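        // Pending sequences are stored as deltas from the ack floor and pending
        // timestamps as second-granularity deltas from a minimum timestamp.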
  8944  func decodeConsumerState(buf []byte) (*ConsumerState, error) {
  8945  	version, err := checkConsumerHeader(buf)
  8946  	if err != nil {
  8947  		return nil, err
  8948  	}
  8949  
  8950  	bi := hdrLen
  8951  	// Helpers, will set bi to -1 on error.
  8952  	readSeq := func() uint64 {
  8953  		if bi < 0 {
  8954  			return 0
  8955  		}
  8956  		seq, n := binary.Uvarint(buf[bi:])
  8957  		if n <= 0 {
  8958  			bi = -1
  8959  			return 0
  8960  		}
  8961  		bi += n
  8962  		return seq
  8963  	}
  8964  	readTimeStamp := func() int64 {
  8965  		if bi < 0 {
  8966  			return 0
  8967  		}
  8968  		ts, n := binary.Varint(buf[bi:])
  8969  		if n <= 0 {
  8970  			bi = -1
  8971  			return -1
  8972  		}
  8973  		bi += n
  8974  		return ts
  8975  	}
  8976  	// Just for clarity below.
  8977  	readLen := readSeq
  8978  	readCount := readSeq
  8979  
  8980  	state := &ConsumerState{}
  8981  	state.AckFloor.Consumer = readSeq()
  8982  	state.AckFloor.Stream = readSeq()
  8983  	state.Delivered.Consumer = readSeq()
  8984  	state.Delivered.Stream = readSeq()
  8985  
  8986  	if bi == -1 {
  8987  		return nil, errCorruptState
  8988  	}
  8989  	if version == 1 {
  8990  		// Version 1 stored delivered relative to the ack floor and as the next
  8991  		// sequence to be delivered, so add the floor back in and step down by one.
  8992  		if state.AckFloor.Consumer > 1 {
  8993  			state.Delivered.Consumer += state.AckFloor.Consumer - 1
  8994  		}
  8995  		if state.AckFloor.Stream > 1 {
  8996  			state.Delivered.Stream += state.AckFloor.Stream - 1
  8997  		}
  8998  	}
  8999  
  9000  	// Protect ourselves against rolling backwards.
  9001  	const hbit = 1 << 63
  9002  	if state.AckFloor.Stream&hbit != 0 || state.Delivered.Stream&hbit != 0 {
  9003  		return nil, errCorruptState
  9004  	}
  9005  
  9006  	// Pending entries follow, if any.
  9007  	if numPending := readLen(); numPending > 0 {
  9008  		mints := readTimeStamp()
  9009  		state.Pending = make(map[uint64]*Pending, numPending)
  9010  		for i := 0; i < int(numPending); i++ {
  9011  			sseq := readSeq()
  9012  			var dseq uint64
  9013  			if version == 2 {
  9014  				dseq = readSeq()
  9015  			}
  9016  			ts := readTimeStamp()
  9017  			// Check the state machine for corruption, not the value which could be -1.
  9018  			if bi == -1 {
  9019  				return nil, errCorruptState
  9020  			}
  9021  			// Adjust seq back.
  9022  			sseq += state.AckFloor.Stream
  9023  			if sseq == 0 {
  9024  				return nil, errCorruptState
  9025  			}
  9026  			if version == 2 {
  9027  				dseq += state.AckFloor.Consumer
  9028  			}
  9029  			// Adjust the timestamp back.
  9030  			if version == 1 {
  9031  				ts = (ts + mints) * int64(time.Second)
  9032  			} else {
  9033  				ts = (mints - ts) * int64(time.Second)
  9034  			}
  9035  			// Store in pending.
  9036  			state.Pending[sseq] = &Pending{dseq, ts}
  9037  		}
  9038  	}
  9039  
  9040  	// We have redelivered entries here.
  9041  	if numRedelivered := readLen(); numRedelivered > 0 {
  9042  		state.Redelivered = make(map[uint64]uint64, numRedelivered)
  9043  		for i := 0; i < int(numRedelivered); i++ {
  9044  			if seq, n := readSeq(), readCount(); seq > 0 && n > 0 {
  9045  				// Adjust seq back.
  9046  				seq += state.AckFloor.Stream
  9047  				state.Redelivered[seq] = n
  9048  			}
  9049  		}
  9050  	}
  9051  
  9052  	return state, nil
  9053  }
  9054  
  9055  // Stop the processing of the consumer's state.
  9056  func (o *consumerFileStore) Stop() error {
  9057  	o.mu.Lock()
  9058  	if o.closed {
  9059  		o.mu.Unlock()
  9060  		return nil
  9061  	}
  9062  	if o.qch != nil {
  9063  		close(o.qch)
  9064  		o.qch = nil
  9065  	}
  9066  
  9067  	var err error
  9068  	var buf []byte
  9069  
  9070  	if o.dirty {
  9071  		// Make sure to write this out.
  9072  		if buf, err = o.encodeState(); err == nil && len(buf) > 0 {
  9073  			if o.aek != nil {
  9074  				buf = o.encryptState(buf)
  9075  			}
  9076  		}
  9077  	}
  9078  
  9079  	o.odir = _EMPTY_
  9080  	o.closed = true
  9081  	ifn, fs := o.ifn, o.fs
  9082  	o.mu.Unlock()
  9083  
  9084  	fs.RemoveConsumer(o)
  9085  
  9086  	if len(buf) > 0 {
  9087  		o.waitOnFlusher()
  9088  		<-dios
  9089  		err = os.WriteFile(ifn, buf, defaultFilePerms)
  9090  		dios <- struct{}{}
  9091  	}
  9092  	return err
  9093  }
  9094  
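        // waitOnFlusher gives an in-flight flusher up to ~100ms to finish its write
        // before Stop writes the final state itself.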
  9095  func (o *consumerFileStore) waitOnFlusher() {
  9096  	if !o.inFlusher() {
  9097  		return
  9098  	}
  9099  
  9100  	timeout := time.Now().Add(100 * time.Millisecond)
  9101  	for time.Now().Before(timeout) {
  9102  		if !o.inFlusher() {
  9103  			return
  9104  		}
  9105  		time.Sleep(10 * time.Millisecond)
  9106  	}
  9107  }
  9108  
  9109  // Delete the consumer.
  9110  func (o *consumerFileStore) Delete() error {
  9111  	return o.delete(false)
  9112  }
  9113  
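        // StreamDelete is the delete path used when the parent stream itself is being
        // deleted, so directory removal and RemoveConsumer are skipped here.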
  9114  func (o *consumerFileStore) StreamDelete() error {
  9115  	return o.delete(true)
  9116  }
  9117  
  9118  func (o *consumerFileStore) delete(streamDeleted bool) error {
  9119  	o.mu.Lock()
  9120  	if o.closed {
  9121  		o.mu.Unlock()
  9122  		return nil
  9123  	}
  9124  	if o.qch != nil {
  9125  		close(o.qch)
  9126  		o.qch = nil
  9127  	}
  9128  
  9129  	var err error
  9130  	odir := o.odir
  9131  	o.odir = _EMPTY_
  9132  	o.closed = true
  9133  	fs := o.fs
  9134  	o.mu.Unlock()
  9135  
  9136  	// If our stream was not deleted this will remove the directories.
  9137  	if odir != _EMPTY_ && !streamDeleted {
  9138  		<-dios
  9139  		err = os.RemoveAll(odir)
  9140  		dios <- struct{}{}
  9141  	}
  9142  
  9143  	if !streamDeleted {
  9144  		fs.RemoveConsumer(o)
  9145  	}
  9146  
  9147  	return err
  9148  }
  9149  
  9150  func (fs *fileStore) AddConsumer(o ConsumerStore) error {
  9151  	fs.cmu.Lock()
  9152  	defer fs.cmu.Unlock()
  9153  	fs.cfs = append(fs.cfs, o)
  9154  	return nil
  9155  }
  9156  
  9157  func (fs *fileStore) RemoveConsumer(o ConsumerStore) error {
  9158  	fs.cmu.Lock()
  9159  	defer fs.cmu.Unlock()
  9160  	for i, cfs := range fs.cfs {
  9161  		if o == cfs {
  9162  			fs.cfs = append(fs.cfs[:i], fs.cfs[i+1:]...)
  9163  			break
  9164  		}
  9165  	}
  9166  	return nil
  9167  }
  9168  
  9169  ////////////////////////////////////////////////////////////////////////////////
  9170  // Templates
  9171  ////////////////////////////////////////////////////////////////////////////////
  9172  
  9173  type templateFileStore struct {
  9174  	dir string
  9175  	hh  hash.Hash64
  9176  }
  9177  
  9178  func newTemplateFileStore(storeDir string) *templateFileStore {
  9179  	tdir := filepath.Join(storeDir, tmplsDir)
  9180  	key := sha256.Sum256([]byte("templates"))
  9181  	hh, err := highwayhash.New64(key[:])
  9182  	if err != nil {
  9183  		return nil
  9184  	}
  9185  	return &templateFileStore{dir: tdir, hh: hh}
  9186  }
  9187  
  9188  func (ts *templateFileStore) Store(t *streamTemplate) error {
  9189  	dir := filepath.Join(ts.dir, t.Name)
  9190  	if err := os.MkdirAll(dir, defaultDirPerms); err != nil {
  9191  		return fmt.Errorf("could not create templates storage directory for %q - %v", t.Name, err)
  9192  	}
  9193  	meta := filepath.Join(dir, JetStreamMetaFile)
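        	// If the meta file already exists, return nil and leave it as-is; any stat
        	// error other than "not exists" is returned. We only write when it is missing.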
  9194  	if _, err := os.Stat(meta); (err != nil && !os.IsNotExist(err)) || err == nil {
  9195  		return err
  9196  	}
  9197  	t.mu.Lock()
  9198  	b, err := json.Marshal(t)
  9199  	t.mu.Unlock()
  9200  	if err != nil {
  9201  		return err
  9202  	}
  9203  	if err := os.WriteFile(meta, b, defaultFilePerms); err != nil {
  9204  		return err
  9205  	}
  9206  	// FIXME(dlc) - Do checksum
  9207  	ts.hh.Reset()
  9208  	ts.hh.Write(b)
  9209  	checksum := hex.EncodeToString(ts.hh.Sum(nil))
  9210  	sum := filepath.Join(dir, JetStreamMetaFileSum)
  9211  	if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil {
  9212  		return err
  9213  	}
  9214  	return nil
  9215  }
  9216  
  9217  func (ts *templateFileStore) Delete(t *streamTemplate) error {
  9218  	return os.RemoveAll(filepath.Join(ts.dir, t.Name))
  9219  }
  9220  
  9221  ////////////////////////////////////////////////////////////////////////////////
  9222  // Compression
  9223  ////////////////////////////////////////////////////////////////////////////////
  9224  
  9225  type CompressionInfo struct {
  9226  	Algorithm    StoreCompression
  9227  	OriginalSize uint64
  9228  }
  9229  
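        // MarshalMetadata encodes the compression info as the 3-byte marker "cmp",
        // one algorithm byte, and the original block size as a uvarint.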
  9230  func (c *CompressionInfo) MarshalMetadata() []byte {
  9231  	b := make([]byte, 14) // 4 + potentially up to 10 for uint64
  9232  	b[0], b[1], b[2] = 'c', 'm', 'p'
  9233  	b[3] = byte(c.Algorithm)
  9234  	n := binary.PutUvarint(b[4:], c.OriginalSize)
  9235  	return b[:4+n]
  9236  }
  9237  
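        // UnmarshalMetadata parses metadata written by MarshalMetadata, returning the
        // number of bytes consumed, or 0 with no error when the marker is absent.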
  9238  func (c *CompressionInfo) UnmarshalMetadata(b []byte) (int, error) {
  9239  	c.Algorithm = NoCompression
  9240  	c.OriginalSize = 0
  9241  	if len(b) < 5 { // 4 + min 1 for uvarint uint64
  9242  		return 0, nil
  9243  	}
  9244  	if b[0] != 'c' || b[1] != 'm' || b[2] != 'p' {
  9245  		return 0, nil
  9246  	}
  9247  	var n int
  9248  	c.Algorithm = StoreCompression(b[3])
  9249  	c.OriginalSize, n = binary.Uvarint(b[4:])
  9250  	if n <= 0 {
  9251  		return 0, fmt.Errorf("metadata incomplete")
  9252  	}
  9253  	return 4 + n, nil
  9254  }
  9255  
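        // Compress compresses a message block with the configured algorithm. The
        // trailing block checksum is never compressed and is re-appended as-is.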
  9256  func (alg StoreCompression) Compress(buf []byte) ([]byte, error) {
  9257  	if len(buf) < checksumSize {
  9258  		return nil, fmt.Errorf("uncompressed buffer is too short")
  9259  	}
  9260  	bodyLen := int64(len(buf) - checksumSize)
  9261  	var output bytes.Buffer
  9262  	var writer io.WriteCloser
  9263  	switch alg {
  9264  	case NoCompression:
  9265  		return buf, nil
  9266  	case S2Compression:
  9267  		writer = s2.NewWriter(&output)
  9268  	default:
  9269  		return nil, fmt.Errorf("compression algorithm not known")
  9270  	}
  9271  
  9272  	input := bytes.NewReader(buf[:bodyLen])
  9273  	checksum := buf[bodyLen:]
  9274  
  9275  	// Compress the block content, but don't compress the checksum.
  9276  	// We will preserve it at the end of the block as-is.
  9277  	if n, err := io.CopyN(writer, input, bodyLen); err != nil {
  9278  		return nil, fmt.Errorf("error writing to compression writer: %w", err)
  9279  	} else if n != bodyLen {
  9280  		return nil, fmt.Errorf("short write on body (%d != %d)", n, bodyLen)
  9281  	}
  9282  	if err := writer.Close(); err != nil {
  9283  		return nil, fmt.Errorf("error closing compression writer: %w", err)
  9284  	}
  9285  
  9286  	// Now add the checksum back onto the end of the block.
  9287  	if n, err := output.Write(checksum); err != nil {
  9288  		return nil, fmt.Errorf("error writing checksum: %w", err)
  9289  	} else if n != checksumSize {
  9290  		return nil, fmt.Errorf("short write on checksum (%d != %d)", n, checksumSize)
  9291  	}
  9292  
  9293  	return output.Bytes(), nil
  9294  }
  9295  
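        // Decompress reverses Compress: the body is inflated and the trailing,
        // uncompressed checksum is re-appended to the result.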
  9296  func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) {
  9297  	if len(buf) < checksumSize {
  9298  		return nil, fmt.Errorf("compressed buffer is too short")
  9299  	}
  9300  	bodyLen := int64(len(buf) - checksumSize)
  9301  	input := bytes.NewReader(buf[:bodyLen])
  9302  
  9303  	var reader io.ReadCloser
  9304  	switch alg {
  9305  	case NoCompression:
  9306  		return buf, nil
  9307  	case S2Compression:
  9308  		reader = io.NopCloser(s2.NewReader(input))
  9309  	default:
  9310  		return nil, fmt.Errorf("compression algorithm not known")
  9311  	}
  9312  
  9313  	// Decompress the block content. The checksum isn't compressed so
  9314  	// we can preserve it from the end of the block as-is.
  9315  	checksum := buf[bodyLen:]
  9316  	output, err := io.ReadAll(reader)
  9317  	if err != nil {
  9318  		return nil, fmt.Errorf("error reading compression reader: %w", err)
  9319  	}
  9320  	output = append(output, checksum...)
  9321  
  9322  	return output, reader.Close()
  9323  }