get.pme.sh/pnats@v0.0.0-20240304004023-26bb5a137ed0/server/filestore.go (about) 1 // Copyright 2019-2024 The NATS Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package server 15 16 import ( 17 "archive/tar" 18 "bytes" 19 "crypto/aes" 20 "crypto/cipher" 21 "crypto/rand" 22 "crypto/sha256" 23 "encoding/binary" 24 "encoding/hex" 25 "encoding/json" 26 "errors" 27 "fmt" 28 "hash" 29 "io" 30 "math" 31 "net" 32 "os" 33 "path/filepath" 34 "sort" 35 "strings" 36 "sync" 37 "sync/atomic" 38 "time" 39 40 "get.pme.sh/pnats/server/avl" 41 "get.pme.sh/pnats/server/stree" 42 "github.com/klauspost/compress/s2" 43 "github.com/minio/highwayhash" 44 "golang.org/x/crypto/chacha20" 45 "golang.org/x/crypto/chacha20poly1305" 46 ) 47 48 type FileStoreConfig struct { 49 // Where the parent directory for all storage will be located. 50 StoreDir string 51 // BlockSize is the file block size. This also represents the maximum overhead size. 52 BlockSize uint64 53 // CacheExpire is how long with no activity until we expire the cache. 54 CacheExpire time.Duration 55 // SyncInterval is how often we sync to disk in the background. 56 SyncInterval time.Duration 57 // SyncAlways is when the stream should sync all data writes. 58 SyncAlways bool 59 // AsyncFlush allows async flush to batch write operations. 60 AsyncFlush bool 61 // Cipher is the cipher to use when encrypting. 62 Cipher StoreCipher 63 // Compression is the algorithm to use when compressing. 64 Compression StoreCompression 65 66 // Internal reference to our server. 67 srv *Server 68 } 69 70 // FileStreamInfo allows us to remember created time. 71 type FileStreamInfo struct { 72 Created time.Time 73 StreamConfig 74 } 75 76 type StoreCipher int 77 78 const ( 79 ChaCha StoreCipher = iota 80 AES 81 NoCipher 82 ) 83 84 func (cipher StoreCipher) String() string { 85 switch cipher { 86 case ChaCha: 87 return "ChaCha20-Poly1305" 88 case AES: 89 return "AES-GCM" 90 case NoCipher: 91 return "None" 92 default: 93 return "Unknown StoreCipher" 94 } 95 } 96 97 type StoreCompression uint8 98 99 const ( 100 NoCompression StoreCompression = iota 101 S2Compression 102 ) 103 104 func (alg StoreCompression) String() string { 105 switch alg { 106 case NoCompression: 107 return "None" 108 case S2Compression: 109 return "S2" 110 default: 111 return "Unknown StoreCompression" 112 } 113 } 114 115 func (alg StoreCompression) MarshalJSON() ([]byte, error) { 116 var str string 117 switch alg { 118 case S2Compression: 119 str = "s2" 120 case NoCompression: 121 str = "none" 122 default: 123 return nil, fmt.Errorf("unknown compression algorithm") 124 } 125 return json.Marshal(str) 126 } 127 128 func (alg *StoreCompression) UnmarshalJSON(b []byte) error { 129 var str string 130 if err := json.Unmarshal(b, &str); err != nil { 131 return err 132 } 133 switch str { 134 case "s2": 135 *alg = S2Compression 136 case "none": 137 *alg = NoCompression 138 default: 139 return fmt.Errorf("unknown compression algorithm") 140 } 141 return nil 142 } 143 144 // File ConsumerInfo is used for creating consumer stores. 145 type FileConsumerInfo struct { 146 Created time.Time 147 Name string 148 ConsumerConfig 149 } 150 151 // Default file and directory permissions. 152 const ( 153 defaultDirPerms = os.FileMode(0750) 154 defaultFilePerms = os.FileMode(0640) 155 ) 156 157 type psi struct { 158 total uint64 159 fblk uint32 160 lblk uint32 161 } 162 163 type fileStore struct { 164 srv *Server 165 mu sync.RWMutex 166 state StreamState 167 tombs []uint64 168 ld *LostStreamData 169 scb StorageUpdateHandler 170 ageChk *time.Timer 171 syncTmr *time.Timer 172 cfg FileStreamInfo 173 fcfg FileStoreConfig 174 prf keyGen 175 oldprf keyGen 176 aek cipher.AEAD 177 lmb *msgBlock 178 blks []*msgBlock 179 bim map[uint32]*msgBlock 180 psim *stree.SubjectTree[psi] 181 tsl int 182 adml int 183 hh hash.Hash64 184 qch chan struct{} 185 fch chan struct{} 186 fsld chan struct{} 187 cmu sync.RWMutex 188 cfs []ConsumerStore 189 sips int 190 dirty int 191 closing bool 192 closed bool 193 fip bool 194 receivedAny bool 195 } 196 197 // Represents a message store block and its data. 198 type msgBlock struct { 199 // Here for 32bit systems and atomic. 200 first msgId 201 last msgId 202 mu sync.RWMutex 203 fs *fileStore 204 aek cipher.AEAD 205 bek cipher.Stream 206 seed []byte 207 nonce []byte 208 mfn string 209 mfd *os.File 210 cmp StoreCompression // Effective compression at the time of loading the block 211 liwsz int64 212 index uint32 213 bytes uint64 // User visible bytes count. 214 rbytes uint64 // Total bytes (raw) including deleted. Used for rolling to new blk. 215 msgs uint64 // User visible message count. 216 fss map[string]*SimpleState 217 kfn string 218 lwts int64 219 llts int64 220 lrts int64 221 llseq uint64 222 hh hash.Hash64 223 cache *cache 224 cloads uint64 225 cexp time.Duration 226 ctmr *time.Timer 227 werr error 228 dmap avl.SequenceSet 229 fch chan struct{} 230 qch chan struct{} 231 lchk [8]byte 232 loading bool 233 flusher bool 234 noTrack bool 235 needSync bool 236 syncAlways bool 237 closed bool 238 239 // Used to mock write failures. 240 mockWriteErr bool 241 } 242 243 // Write through caching layer that is also used on loading messages. 244 type cache struct { 245 buf []byte 246 off int 247 wp int 248 idx []uint32 249 lrl uint32 250 fseq uint64 251 nra bool 252 } 253 254 type msgId struct { 255 seq uint64 256 ts int64 257 } 258 259 const ( 260 // Magic is used to identify the file store files. 261 magic = uint8(22) 262 // Version 263 version = uint8(1) 264 // New IndexInfo Version 265 newVersion = uint8(2) 266 // hdrLen 267 hdrLen = 2 268 // This is where we keep the streams. 269 streamsDir = "streams" 270 // This is where we keep the message store blocks. 271 msgDir = "msgs" 272 // This is where we temporarily move the messages dir. 273 purgeDir = "__msgs__" 274 // used to scan blk file names. 275 blkScan = "%d.blk" 276 // used for compacted blocks that are staged. 277 newScan = "%d.new" 278 // used to scan index file names. 279 indexScan = "%d.idx" 280 // used to store our block encryption key. 281 keyScan = "%d.key" 282 // to look for orphans 283 keyScanAll = "*.key" 284 // This is where we keep state on consumers. 285 consumerDir = "obs" 286 // Index file for a consumer. 287 consumerState = "o.dat" 288 // The suffix that will be given to a new temporary block during compression. 289 compressTmpSuffix = ".tmp" 290 // This is where we keep state on templates. 291 tmplsDir = "templates" 292 // Maximum size of a write buffer we may consider for re-use. 293 maxBufReuse = 2 * 1024 * 1024 294 // default cache buffer expiration 295 defaultCacheBufferExpiration = 2 * time.Second 296 // default sync interval 297 defaultSyncInterval = 2 * time.Minute 298 // default idle timeout to close FDs. 299 closeFDsIdle = 30 * time.Second 300 // coalesceMinimum 301 coalesceMinimum = 16 * 1024 302 // maxFlushWait is maximum we will wait to gather messages to flush. 303 maxFlushWait = 8 * time.Millisecond 304 305 // Metafiles for streams and consumers. 306 JetStreamMetaFile = "meta.inf" 307 JetStreamMetaFileSum = "meta.sum" 308 JetStreamMetaFileKey = "meta.key" 309 310 // This is the full snapshotted state for the stream. 311 streamStreamStateFile = "index.db" 312 313 // AEK key sizes 314 minMetaKeySize = 64 315 minBlkKeySize = 64 316 317 // Default stream block size. 318 defaultLargeBlockSize = 8 * 1024 * 1024 // 8MB 319 // Default for workqueue or interest based. 320 defaultMediumBlockSize = 4 * 1024 * 1024 // 4MB 321 // For smaller reuse buffers. Usually being generated during contention on the lead write buffer. 322 // E.g. mirrors/sources etc. 323 defaultSmallBlockSize = 1 * 1024 * 1024 // 1MB 324 // Maximum size for the encrypted head block. 325 maximumEncryptedBlockSize = 2 * 1024 * 1024 // 2MB 326 // Default for KV based 327 defaultKVBlockSize = defaultMediumBlockSize 328 // max block size for now. 329 maxBlockSize = defaultLargeBlockSize 330 // Compact minimum threshold. 331 compactMinimum = 2 * 1024 * 1024 // 2MB 332 // FileStoreMinBlkSize is minimum size we will do for a blk size. 333 FileStoreMinBlkSize = 32 * 1000 // 32kib 334 // FileStoreMaxBlkSize is maximum size we will do for a blk size. 335 FileStoreMaxBlkSize = maxBlockSize 336 // Check for bad record length value due to corrupt data. 337 rlBadThresh = 32 * 1024 * 1024 338 // Checksum size for hash for msg records. 339 recordHashSize = 8 340 ) 341 342 func newFileStore(fcfg FileStoreConfig, cfg StreamConfig) (*fileStore, error) { 343 return newFileStoreWithCreated(fcfg, cfg, time.Now().UTC(), nil, nil) 344 } 345 346 func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created time.Time, prf, oldprf keyGen) (*fileStore, error) { 347 if cfg.Name == _EMPTY_ { 348 return nil, fmt.Errorf("name required") 349 } 350 if cfg.Storage != FileStorage { 351 return nil, fmt.Errorf("fileStore requires file storage type in config") 352 } 353 // Default values. 354 if fcfg.BlockSize == 0 { 355 fcfg.BlockSize = dynBlkSize(cfg.Retention, cfg.MaxBytes, prf != nil) 356 } 357 if fcfg.BlockSize > maxBlockSize { 358 return nil, fmt.Errorf("filestore max block size is %s", friendlyBytes(maxBlockSize)) 359 } 360 if fcfg.CacheExpire == 0 { 361 fcfg.CacheExpire = defaultCacheBufferExpiration 362 } 363 if fcfg.SyncInterval == 0 { 364 fcfg.SyncInterval = defaultSyncInterval 365 } 366 367 // Check the directory 368 if stat, err := os.Stat(fcfg.StoreDir); os.IsNotExist(err) { 369 if err := os.MkdirAll(fcfg.StoreDir, defaultDirPerms); err != nil { 370 return nil, fmt.Errorf("could not create storage directory - %v", err) 371 } 372 } else if stat == nil || !stat.IsDir() { 373 return nil, fmt.Errorf("storage directory is not a directory") 374 } 375 tmpfile, err := os.CreateTemp(fcfg.StoreDir, "_test_") 376 if err != nil { 377 return nil, fmt.Errorf("storage directory is not writable") 378 } 379 380 tmpfile.Close() 381 <-dios 382 os.Remove(tmpfile.Name()) 383 dios <- struct{}{} 384 385 fs := &fileStore{ 386 fcfg: fcfg, 387 psim: stree.NewSubjectTree[psi](), 388 bim: make(map[uint32]*msgBlock), 389 cfg: FileStreamInfo{Created: created, StreamConfig: cfg}, 390 prf: prf, 391 oldprf: oldprf, 392 qch: make(chan struct{}), 393 fch: make(chan struct{}, 1), 394 fsld: make(chan struct{}), 395 srv: fcfg.srv, 396 } 397 398 // Set flush in place to AsyncFlush which by default is false. 399 fs.fip = !fcfg.AsyncFlush 400 401 // Check if this is a new setup. 402 mdir := filepath.Join(fcfg.StoreDir, msgDir) 403 odir := filepath.Join(fcfg.StoreDir, consumerDir) 404 if err := os.MkdirAll(mdir, defaultDirPerms); err != nil { 405 return nil, fmt.Errorf("could not create message storage directory - %v", err) 406 } 407 if err := os.MkdirAll(odir, defaultDirPerms); err != nil { 408 return nil, fmt.Errorf("could not create consumer storage directory - %v", err) 409 } 410 411 // Create highway hash for message blocks. Use sha256 of directory as key. 412 key := sha256.Sum256([]byte(cfg.Name)) 413 fs.hh, err = highwayhash.New64(key[:]) 414 if err != nil { 415 return nil, fmt.Errorf("could not create hash: %v", err) 416 } 417 418 keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey) 419 // Make sure we do not have an encrypted store underneath of us but no main key. 420 if fs.prf == nil { 421 if _, err := os.Stat(keyFile); err == nil { 422 return nil, errNoMainKey 423 } 424 } 425 426 // Attempt to recover our state. 427 err = fs.recoverFullState() 428 if err != nil { 429 // Hold onto state 430 prior := fs.state 431 // Reset anything that could have been set from above. 432 fs.state = StreamState{} 433 fs.psim, fs.tsl = fs.psim.Empty(), 0 434 fs.bim = make(map[uint32]*msgBlock) 435 fs.blks = nil 436 fs.tombs = nil 437 438 // Recover our message state the old way 439 if err := fs.recoverMsgs(); err != nil { 440 return nil, err 441 } 442 443 // Check if our prior state remembers a last sequence past where we can see. 444 if fs.ld != nil && prior.LastSeq > fs.state.LastSeq { 445 fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime 446 if lmb, err := fs.newMsgBlockForWrite(); err == nil { 447 lmb.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano()) 448 } else { 449 return nil, err 450 } 451 } 452 // Since we recovered here, make sure to kick ourselves to write out our stream state. 453 fs.dirty++ 454 defer fs.kickFlushStateLoop() 455 } 456 457 // Also make sure we get rid of old idx and fss files on return. 458 // Do this in separate go routine vs inline and at end of processing. 459 defer func() { 460 go fs.cleanupOldMeta() 461 }() 462 463 // Lock while do enforcements and removals. 464 fs.mu.Lock() 465 466 // Check if we have any left over tombstones to process. 467 if len(fs.tombs) > 0 { 468 for _, seq := range fs.tombs { 469 fs.removeMsg(seq, false, true, false) 470 fs.removeFromLostData(seq) 471 } 472 // Not needed after this phase. 473 fs.tombs = nil 474 } 475 476 // Limits checks and enforcement. 477 fs.enforceMsgLimit() 478 fs.enforceBytesLimit() 479 480 // Do age checks too, make sure to call in place. 481 if fs.cfg.MaxAge != 0 { 482 fs.expireMsgsOnRecover() 483 fs.startAgeChk() 484 } 485 486 // If we have max msgs per subject make sure the is also enforced. 487 if fs.cfg.MaxMsgsPer > 0 { 488 fs.enforceMsgPerSubjectLimit(false) 489 } 490 491 // Grab first sequence for check below while we have lock. 492 firstSeq := fs.state.FirstSeq 493 fs.mu.Unlock() 494 495 // If the stream has an initial sequence number then make sure we 496 // have purged up until that point. We will do this only if the 497 // recovered first sequence number is before our configured first 498 // sequence. Need to do this locked as by now the age check timer 499 // has started. 500 if cfg.FirstSeq > 0 && firstSeq <= cfg.FirstSeq { 501 if _, err := fs.purge(cfg.FirstSeq); err != nil { 502 return nil, err 503 } 504 } 505 506 // Write our meta data if it does not exist or is zero'd out. 507 meta := filepath.Join(fcfg.StoreDir, JetStreamMetaFile) 508 fi, err := os.Stat(meta) 509 if err != nil && os.IsNotExist(err) || fi != nil && fi.Size() == 0 { 510 if err := fs.writeStreamMeta(); err != nil { 511 return nil, err 512 } 513 } 514 515 // If we expect to be encrypted check that what we are restoring is not plaintext. 516 // This can happen on snapshot restores or conversions. 517 if fs.prf != nil { 518 if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) { 519 if err := fs.writeStreamMeta(); err != nil { 520 return nil, err 521 } 522 } 523 } 524 525 // Setup our sync timer. 526 fs.setSyncTimer() 527 528 // Spin up the go routine that will write out or full state stream index. 529 go fs.flushStreamStateLoop(fs.fch, fs.qch, fs.fsld) 530 531 return fs, nil 532 } 533 534 // Lock all existing message blocks. 535 // Lock held on entry. 536 func (fs *fileStore) lockAllMsgBlocks() { 537 for _, mb := range fs.blks { 538 mb.mu.Lock() 539 } 540 } 541 542 // Unlock all existing message blocks. 543 // Lock held on entry. 544 func (fs *fileStore) unlockAllMsgBlocks() { 545 for _, mb := range fs.blks { 546 mb.mu.Unlock() 547 } 548 } 549 550 func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error { 551 if fs.isClosed() { 552 return ErrStoreClosed 553 } 554 if cfg.Name == _EMPTY_ { 555 return fmt.Errorf("name required") 556 } 557 if cfg.Storage != FileStorage { 558 return fmt.Errorf("fileStore requires file storage type in config") 559 } 560 561 fs.mu.Lock() 562 new_cfg := FileStreamInfo{Created: fs.cfg.Created, StreamConfig: *cfg} 563 old_cfg := fs.cfg 564 // The reference story has changed here, so this full msg block lock 565 // may not be needed. 566 fs.lockAllMsgBlocks() 567 fs.cfg = new_cfg 568 fs.unlockAllMsgBlocks() 569 if err := fs.writeStreamMeta(); err != nil { 570 fs.lockAllMsgBlocks() 571 fs.cfg = old_cfg 572 fs.unlockAllMsgBlocks() 573 fs.mu.Unlock() 574 return err 575 } 576 577 // Limits checks and enforcement. 578 fs.enforceMsgLimit() 579 fs.enforceBytesLimit() 580 581 // Do age timers. 582 if fs.ageChk == nil && fs.cfg.MaxAge != 0 { 583 fs.startAgeChk() 584 } 585 if fs.ageChk != nil && fs.cfg.MaxAge == 0 { 586 fs.ageChk.Stop() 587 fs.ageChk = nil 588 } 589 590 if fs.cfg.MaxMsgsPer > 0 && fs.cfg.MaxMsgsPer < old_cfg.MaxMsgsPer { 591 fs.enforceMsgPerSubjectLimit(true) 592 } 593 fs.mu.Unlock() 594 595 if cfg.MaxAge != 0 { 596 fs.expireMsgs() 597 } 598 return nil 599 } 600 601 func dynBlkSize(retention RetentionPolicy, maxBytes int64, encrypted bool) uint64 { 602 if maxBytes > 0 { 603 blkSize := (maxBytes / 4) + 1 // (25% overhead) 604 // Round up to nearest 100 605 if m := blkSize % 100; m != 0 { 606 blkSize += 100 - m 607 } 608 if blkSize <= FileStoreMinBlkSize { 609 blkSize = FileStoreMinBlkSize 610 } else if blkSize >= FileStoreMaxBlkSize { 611 blkSize = FileStoreMaxBlkSize 612 } else { 613 blkSize = defaultMediumBlockSize 614 } 615 if encrypted && blkSize > maximumEncryptedBlockSize { 616 // Notes on this below. 617 blkSize = maximumEncryptedBlockSize 618 } 619 return uint64(blkSize) 620 } 621 622 switch { 623 case encrypted: 624 // In the case of encrypted stores, large blocks can result in worsened perf 625 // since many writes on disk involve re-encrypting the entire block. For now, 626 // we will enforce a cap on the block size when encryption is enabled to avoid 627 // this. 628 return maximumEncryptedBlockSize 629 case retention == LimitsPolicy: 630 // TODO(dlc) - Make the blocksize relative to this if set. 631 return defaultLargeBlockSize 632 default: 633 // TODO(dlc) - Make the blocksize relative to this if set. 634 return defaultMediumBlockSize 635 } 636 } 637 638 func genEncryptionKey(sc StoreCipher, seed []byte) (ek cipher.AEAD, err error) { 639 if sc == ChaCha { 640 ek, err = chacha20poly1305.NewX(seed) 641 } else if sc == AES { 642 block, e := aes.NewCipher(seed) 643 if e != nil { 644 return nil, err 645 } 646 ek, err = cipher.NewGCMWithNonceSize(block, block.BlockSize()) 647 } else { 648 err = errUnknownCipher 649 } 650 return ek, err 651 } 652 653 // Generate an asset encryption key from the context and server PRF. 654 func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cipher.Stream, seed, encrypted []byte, err error) { 655 if fs.prf == nil { 656 return nil, nil, nil, nil, errNoEncryption 657 } 658 // Generate key encryption key. 659 rb, err := fs.prf([]byte(context)) 660 if err != nil { 661 return nil, nil, nil, nil, err 662 } 663 664 sc := fs.fcfg.Cipher 665 666 kek, err := genEncryptionKey(sc, rb) 667 if err != nil { 668 return nil, nil, nil, nil, err 669 } 670 // Generate random asset encryption key seed. 671 672 const seedSize = 32 673 seed = make([]byte, seedSize) 674 if n, err := rand.Read(seed); err != nil || n != seedSize { 675 return nil, nil, nil, nil, err 676 } 677 678 aek, err = genEncryptionKey(sc, seed) 679 if err != nil { 680 return nil, nil, nil, nil, err 681 } 682 683 // Generate our nonce. Use same buffer to hold encrypted seed. 684 nonce := make([]byte, kek.NonceSize(), kek.NonceSize()+len(seed)+kek.Overhead()) 685 rand.Read(nonce) 686 687 bek, err = genBlockEncryptionKey(sc, seed[:], nonce) 688 if err != nil { 689 return nil, nil, nil, nil, err 690 } 691 692 return aek, bek, seed, kek.Seal(nonce, nonce, seed, nil), nil 693 } 694 695 // Will generate the block encryption key. 696 func genBlockEncryptionKey(sc StoreCipher, seed, nonce []byte) (cipher.Stream, error) { 697 if sc == ChaCha { 698 return chacha20.NewUnauthenticatedCipher(seed, nonce) 699 } else if sc == AES { 700 block, err := aes.NewCipher(seed) 701 if err != nil { 702 return nil, err 703 } 704 return cipher.NewCTR(block, nonce), nil 705 } 706 return nil, errUnknownCipher 707 } 708 709 // Lock should be held. 710 func (fs *fileStore) recoverAEK() error { 711 if fs.prf != nil && fs.aek == nil { 712 ekey, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)) 713 if err != nil { 714 return err 715 } 716 rb, err := fs.prf([]byte(fs.cfg.Name)) 717 if err != nil { 718 return err 719 } 720 kek, err := genEncryptionKey(fs.fcfg.Cipher, rb) 721 if err != nil { 722 return err 723 } 724 ns := kek.NonceSize() 725 seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) 726 if err != nil { 727 return err 728 } 729 aek, err := genEncryptionKey(fs.fcfg.Cipher, seed) 730 if err != nil { 731 return err 732 } 733 fs.aek = aek 734 } 735 return nil 736 } 737 738 // Lock should be held. 739 func (fs *fileStore) setupAEK() error { 740 if fs.prf != nil && fs.aek == nil { 741 key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name) 742 if err != nil { 743 return err 744 } 745 keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey) 746 if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) { 747 return err 748 } 749 <-dios 750 err = os.WriteFile(keyFile, encrypted, defaultFilePerms) 751 dios <- struct{}{} 752 if err != nil { 753 return err 754 } 755 // Set our aek. 756 fs.aek = key 757 } 758 return nil 759 } 760 761 // Write out meta and the checksum. 762 // Lock should be held. 763 func (fs *fileStore) writeStreamMeta() error { 764 if err := fs.setupAEK(); err != nil { 765 return err 766 } 767 768 meta := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile) 769 if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) { 770 return err 771 } 772 b, err := json.Marshal(fs.cfg) 773 if err != nil { 774 return err 775 } 776 // Encrypt if needed. 777 if fs.aek != nil { 778 nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(b)+fs.aek.Overhead()) 779 rand.Read(nonce) 780 b = fs.aek.Seal(nonce, nonce, b, nil) 781 } 782 783 <-dios 784 err = os.WriteFile(meta, b, defaultFilePerms) 785 dios <- struct{}{} 786 if err != nil { 787 return err 788 } 789 fs.hh.Reset() 790 fs.hh.Write(b) 791 checksum := hex.EncodeToString(fs.hh.Sum(nil)) 792 sum := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileSum) 793 <-dios 794 err = os.WriteFile(sum, []byte(checksum), defaultFilePerms) 795 dios <- struct{}{} 796 if err != nil { 797 return err 798 } 799 return nil 800 } 801 802 // Pools to recycle the blocks to help with memory pressure. 803 var blkPoolBig sync.Pool // 16MB 804 var blkPoolMedium sync.Pool // 8MB 805 var blkPoolSmall sync.Pool // 2MB 806 807 // Get a new msg block based on sz estimate. 808 func getMsgBlockBuf(sz int) (buf []byte) { 809 var pb any 810 if sz <= defaultSmallBlockSize { 811 pb = blkPoolSmall.Get() 812 } else if sz <= defaultMediumBlockSize { 813 pb = blkPoolMedium.Get() 814 } else { 815 pb = blkPoolBig.Get() 816 } 817 if pb != nil { 818 buf = *(pb.(*[]byte)) 819 } else { 820 // Here we need to make a new blk. 821 // If small leave as is.. 822 if sz > defaultSmallBlockSize && sz <= defaultMediumBlockSize { 823 sz = defaultMediumBlockSize 824 } else if sz > defaultMediumBlockSize { 825 sz = defaultLargeBlockSize 826 } 827 buf = make([]byte, sz) 828 } 829 return buf[:0] 830 } 831 832 // Recycle the msg block. 833 func recycleMsgBlockBuf(buf []byte) { 834 if buf == nil || cap(buf) < defaultSmallBlockSize { 835 return 836 } 837 // Make sure to reset before placing back into pool. 838 buf = buf[:0] 839 840 // We need to make sure the load code gets a block that can fit the maximum for a size block. 841 // E.g. 8, 16 etc. otherwise we thrash and actually make things worse by pulling it out, and putting 842 // it right back in and making a new []byte. 843 // From above we know its already >= defaultSmallBlockSize 844 if sz := cap(buf); sz < defaultMediumBlockSize { 845 blkPoolSmall.Put(&buf) 846 } else if sz < defaultLargeBlockSize { 847 blkPoolMedium.Put(&buf) 848 } else { 849 blkPoolBig.Put(&buf) 850 } 851 } 852 853 const ( 854 msgHdrSize = 22 855 checksumSize = 8 856 emptyRecordLen = msgHdrSize + checksumSize 857 ) 858 859 // Lock should be held. 860 func (fs *fileStore) noTrackSubjects() bool { 861 return !(fs.psim.Size() > 0 || len(fs.cfg.Subjects) > 0 || fs.cfg.Mirror != nil || len(fs.cfg.Sources) > 0) 862 } 863 864 // Will init the basics for a message block. 865 func (fs *fileStore) initMsgBlock(index uint32) *msgBlock { 866 mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects(), syncAlways: fs.fcfg.SyncAlways} 867 868 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 869 mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, index)) 870 871 if mb.hh == nil { 872 key := sha256.Sum256(fs.hashKeyForBlock(index)) 873 mb.hh, _ = highwayhash.New64(key[:]) 874 } 875 return mb 876 } 877 878 // Lock for fs should be held. 879 func (fs *fileStore) loadEncryptionForMsgBlock(mb *msgBlock) error { 880 if fs.prf == nil { 881 return nil 882 } 883 884 var createdKeys bool 885 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 886 ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))) 887 if err != nil { 888 // We do not seem to have keys even though we should. Could be a plaintext conversion. 889 // Create the keys and we will double check below. 890 if err := fs.genEncryptionKeysForBlock(mb); err != nil { 891 return err 892 } 893 createdKeys = true 894 } else { 895 if len(ekey) < minBlkKeySize { 896 return errBadKeySize 897 } 898 // Recover key encryption key. 899 rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))) 900 if err != nil { 901 return err 902 } 903 904 sc := fs.fcfg.Cipher 905 kek, err := genEncryptionKey(sc, rb) 906 if err != nil { 907 return err 908 } 909 ns := kek.NonceSize() 910 seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) 911 if err != nil { 912 // We may be here on a cipher conversion, so attempt to convert. 913 if err = mb.convertCipher(); err != nil { 914 return err 915 } 916 } else { 917 mb.seed, mb.nonce = seed, ekey[:ns] 918 } 919 mb.aek, err = genEncryptionKey(sc, mb.seed) 920 if err != nil { 921 return err 922 } 923 if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil { 924 return err 925 } 926 } 927 928 // If we created keys here, let's check the data and if it is plaintext convert here. 929 if createdKeys { 930 if err := mb.convertToEncrypted(); err != nil { 931 return err 932 } 933 } 934 935 return nil 936 } 937 938 // Load a last checksum if needed from the block file. 939 // Lock should be held. 940 func (mb *msgBlock) ensureLastChecksumLoaded() { 941 var empty [8]byte 942 if mb.lchk != empty { 943 return 944 } 945 copy(mb.lchk[0:], mb.lastChecksum()) 946 } 947 948 // Perform a recover but do not update PSIM. 949 // Lock should be held. 950 func (fs *fileStore) recoverMsgBlockNoSubjectUpdates(index uint32) (*msgBlock, error) { 951 psim, tsl := fs.psim, fs.tsl 952 fs.psim = nil 953 mb, err := fs.recoverMsgBlock(index) 954 fs.psim, fs.tsl = psim, tsl 955 return mb, err 956 } 957 958 // Lock held on entry 959 func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) { 960 mb := fs.initMsgBlock(index) 961 962 // Open up the message file, but we will try to recover from the index file. 963 // We will check that the last checksums match. 964 file, err := mb.openBlock() 965 if err != nil { 966 return nil, err 967 } 968 defer file.Close() 969 970 if fi, err := file.Stat(); fi != nil { 971 mb.rbytes = uint64(fi.Size()) 972 } else { 973 return nil, err 974 } 975 976 // Make sure encryption loaded if needed. 977 fs.loadEncryptionForMsgBlock(mb) 978 979 // Grab last checksum from main block file. 980 var lchk [8]byte 981 if mb.rbytes >= checksumSize { 982 if mb.bek != nil { 983 if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize { 984 mb.bek.XORKeyStream(buf, buf) 985 copy(lchk[0:], buf[len(buf)-checksumSize:]) 986 } 987 } else { 988 file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize) 989 } 990 } 991 992 file.Close() 993 994 // Read our index file. Use this as source of truth if possible. 995 if err := mb.readIndexInfo(); err == nil { 996 // Quick sanity check here. 997 // Note this only checks that the message blk file is not newer then this file, or is empty and we expect empty. 998 if (mb.rbytes == 0 && mb.msgs == 0) || bytes.Equal(lchk[:], mb.lchk[:]) { 999 if mb.msgs > 0 && !mb.noTrack && fs.psim != nil { 1000 fs.populateGlobalPerSubjectInfo(mb) 1001 // Try to dump any state we needed on recovery. 1002 mb.tryForceExpireCacheLocked() 1003 } 1004 fs.addMsgBlock(mb) 1005 return mb, nil 1006 } 1007 } 1008 1009 // If we get data loss rebuilding the message block state record that with the fs itself. 1010 ld, tombs, _ := mb.rebuildState() 1011 if ld != nil { 1012 fs.addLostData(ld) 1013 } 1014 // Collect all tombstones. 1015 if len(tombs) > 0 { 1016 fs.tombs = append(fs.tombs, tombs...) 1017 } 1018 1019 if mb.msgs > 0 && !mb.noTrack && fs.psim != nil { 1020 fs.populateGlobalPerSubjectInfo(mb) 1021 // Try to dump any state we needed on recovery. 1022 mb.tryForceExpireCacheLocked() 1023 } 1024 1025 mb.closeFDs() 1026 fs.addMsgBlock(mb) 1027 1028 return mb, nil 1029 } 1030 1031 func (fs *fileStore) lostData() *LostStreamData { 1032 fs.mu.RLock() 1033 defer fs.mu.RUnlock() 1034 if fs.ld == nil { 1035 return nil 1036 } 1037 nld := *fs.ld 1038 return &nld 1039 } 1040 1041 // Lock should be held. 1042 func (fs *fileStore) addLostData(ld *LostStreamData) { 1043 if ld == nil { 1044 return 1045 } 1046 if fs.ld != nil { 1047 var added bool 1048 for _, seq := range ld.Msgs { 1049 if _, found := fs.ld.exists(seq); !found { 1050 fs.ld.Msgs = append(fs.ld.Msgs, seq) 1051 added = true 1052 } 1053 } 1054 if added { 1055 msgs := fs.ld.Msgs 1056 sort.Slice(msgs, func(i, j int) bool { return msgs[i] < msgs[j] }) 1057 fs.ld.Bytes += ld.Bytes 1058 } 1059 } else { 1060 fs.ld = ld 1061 } 1062 } 1063 1064 // Helper to see if we already have this sequence reported in our lost data. 1065 func (ld *LostStreamData) exists(seq uint64) (int, bool) { 1066 i, found := sort.Find(len(ld.Msgs), func(i int) int { 1067 tseq := ld.Msgs[i] 1068 if tseq < seq { 1069 return -1 1070 } 1071 if tseq > seq { 1072 return +1 1073 } 1074 return 0 1075 }) 1076 return i, found 1077 } 1078 1079 func (fs *fileStore) removeFromLostData(seq uint64) { 1080 if fs.ld == nil { 1081 return 1082 } 1083 if i, found := fs.ld.exists(seq); found { 1084 fs.ld.Msgs = append(fs.ld.Msgs[:i], fs.ld.Msgs[i+1:]...) 1085 if len(fs.ld.Msgs) == 0 { 1086 fs.ld = nil 1087 } 1088 } 1089 } 1090 1091 func (fs *fileStore) rebuildState(ld *LostStreamData) { 1092 fs.mu.Lock() 1093 defer fs.mu.Unlock() 1094 fs.rebuildStateLocked(ld) 1095 } 1096 1097 // Lock should be held. 1098 func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) { 1099 fs.addLostData(ld) 1100 1101 fs.state.Msgs, fs.state.Bytes = 0, 0 1102 fs.state.FirstSeq, fs.state.LastSeq = 0, 0 1103 1104 for _, mb := range fs.blks { 1105 mb.mu.RLock() 1106 fs.state.Msgs += mb.msgs 1107 fs.state.Bytes += mb.bytes 1108 fseq := atomic.LoadUint64(&mb.first.seq) 1109 if fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { 1110 fs.state.FirstSeq = fseq 1111 fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() 1112 } 1113 fs.state.LastSeq = atomic.LoadUint64(&mb.last.seq) 1114 fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() 1115 mb.mu.RUnlock() 1116 } 1117 } 1118 1119 // Attempt to convert the cipher used for this message block. 1120 func (mb *msgBlock) convertCipher() error { 1121 fs := mb.fs 1122 sc := fs.fcfg.Cipher 1123 1124 var osc StoreCipher 1125 switch sc { 1126 case ChaCha: 1127 osc = AES 1128 case AES: 1129 osc = ChaCha 1130 } 1131 1132 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 1133 ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))) 1134 if err != nil { 1135 return err 1136 } 1137 if len(ekey) < minBlkKeySize { 1138 return errBadKeySize 1139 } 1140 type prfWithCipher struct { 1141 keyGen 1142 StoreCipher 1143 } 1144 var prfs []prfWithCipher 1145 if fs.prf != nil { 1146 prfs = append(prfs, prfWithCipher{fs.prf, sc}) 1147 prfs = append(prfs, prfWithCipher{fs.prf, osc}) 1148 } 1149 if fs.oldprf != nil { 1150 prfs = append(prfs, prfWithCipher{fs.oldprf, sc}) 1151 prfs = append(prfs, prfWithCipher{fs.oldprf, osc}) 1152 } 1153 1154 for _, prf := range prfs { 1155 // Recover key encryption key. 1156 rb, err := prf.keyGen([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))) 1157 if err != nil { 1158 continue 1159 } 1160 kek, err := genEncryptionKey(prf.StoreCipher, rb) 1161 if err != nil { 1162 continue 1163 } 1164 ns := kek.NonceSize() 1165 seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) 1166 if err != nil { 1167 continue 1168 } 1169 nonce := ekey[:ns] 1170 bek, err := genBlockEncryptionKey(prf.StoreCipher, seed, nonce) 1171 if err != nil { 1172 return err 1173 } 1174 1175 buf, _ := mb.loadBlock(nil) 1176 bek.XORKeyStream(buf, buf) 1177 // Make sure we can parse with old cipher and key file. 1178 if err = mb.indexCacheBuf(buf); err != nil { 1179 return err 1180 } 1181 // Reset the cache since we just read everything in. 1182 mb.cache = nil 1183 1184 // Generate new keys. If we error for some reason then we will put 1185 // the old keyfile back. 1186 if err := fs.genEncryptionKeysForBlock(mb); err != nil { 1187 keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)) 1188 <-dios 1189 os.WriteFile(keyFile, ekey, defaultFilePerms) 1190 dios <- struct{}{} 1191 return err 1192 } 1193 mb.bek.XORKeyStream(buf, buf) 1194 <-dios 1195 err = os.WriteFile(mb.mfn, buf, defaultFilePerms) 1196 dios <- struct{}{} 1197 if err != nil { 1198 return err 1199 } 1200 return nil 1201 } 1202 return fmt.Errorf("unable to recover keys") 1203 } 1204 1205 // Convert a plaintext block to encrypted. 1206 func (mb *msgBlock) convertToEncrypted() error { 1207 if mb.bek == nil { 1208 return nil 1209 } 1210 buf, err := mb.loadBlock(nil) 1211 if err != nil { 1212 return err 1213 } 1214 if err := mb.indexCacheBuf(buf); err != nil { 1215 // This likely indicates this was already encrypted or corrupt. 1216 mb.cache = nil 1217 return err 1218 } 1219 // Undo cache from above for later. 1220 mb.cache = nil 1221 mb.bek.XORKeyStream(buf, buf) 1222 <-dios 1223 err = os.WriteFile(mb.mfn, buf, defaultFilePerms) 1224 dios <- struct{}{} 1225 if err != nil { 1226 return err 1227 } 1228 return nil 1229 } 1230 1231 // Rebuild the state of the blk based on what we have on disk in the N.blk file. 1232 // We will return any lost data, and we will return any delete tombstones we encountered. 1233 func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) { 1234 mb.mu.Lock() 1235 defer mb.mu.Unlock() 1236 return mb.rebuildStateLocked() 1237 } 1238 1239 // Rebuild the state of the blk based on what we have on disk in the N.blk file. 1240 // Lock should be held. 1241 func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { 1242 startLastSeq := atomic.LoadUint64(&mb.last.seq) 1243 1244 // Remove the .fss file and clear any cache we have set. 1245 mb.clearCacheAndOffset() 1246 1247 buf, err := mb.loadBlock(nil) 1248 defer recycleMsgBlockBuf(buf) 1249 1250 if err != nil || len(buf) == 0 { 1251 var ld *LostStreamData 1252 // No data to rebuild from here. 1253 if mb.msgs > 0 { 1254 // We need to declare lost data here. 1255 ld = &LostStreamData{Msgs: make([]uint64, 0, mb.msgs), Bytes: mb.bytes} 1256 firstSeq, lastSeq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 1257 for seq := firstSeq; seq <= lastSeq; seq++ { 1258 if !mb.dmap.Exists(seq) { 1259 ld.Msgs = append(ld.Msgs, seq) 1260 } 1261 } 1262 // Clear invalid state. We will let this blk be added in here. 1263 mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil 1264 mb.dmap.Empty() 1265 atomic.StoreUint64(&mb.first.seq, atomic.LoadUint64(&mb.last.seq)+1) 1266 } 1267 return ld, nil, err 1268 } 1269 1270 // Clear state we need to rebuild. 1271 mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil 1272 atomic.StoreUint64(&mb.last.seq, 0) 1273 mb.last.ts = 0 1274 firstNeedsSet := true 1275 1276 // Check if we need to decrypt. 1277 if mb.bek != nil && len(buf) > 0 { 1278 // Recreate to reset counter. 1279 mb.bek, err = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 1280 if err != nil { 1281 return nil, nil, err 1282 } 1283 mb.bek.XORKeyStream(buf, buf) 1284 } 1285 1286 // Check for compression. 1287 if buf, err = mb.decompressIfNeeded(buf); err != nil { 1288 return nil, nil, err 1289 } 1290 1291 mb.rbytes = uint64(len(buf)) 1292 1293 addToDmap := func(seq uint64) { 1294 if seq == 0 { 1295 return 1296 } 1297 mb.dmap.Insert(seq) 1298 } 1299 1300 var le = binary.LittleEndian 1301 1302 truncate := func(index uint32) { 1303 var fd *os.File 1304 if mb.mfd != nil { 1305 fd = mb.mfd 1306 } else { 1307 <-dios 1308 fd, err = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms) 1309 dios <- struct{}{} 1310 if err == nil { 1311 defer fd.Close() 1312 } 1313 } 1314 if fd == nil { 1315 return 1316 } 1317 if err := fd.Truncate(int64(index)); err == nil { 1318 // Update our checksum. 1319 if index >= 8 { 1320 var lchk [8]byte 1321 fd.ReadAt(lchk[:], int64(index-8)) 1322 copy(mb.lchk[0:], lchk[:]) 1323 } 1324 fd.Sync() 1325 } 1326 } 1327 1328 gatherLost := func(lb uint32) *LostStreamData { 1329 var ld LostStreamData 1330 for seq := atomic.LoadUint64(&mb.last.seq) + 1; seq <= startLastSeq; seq++ { 1331 ld.Msgs = append(ld.Msgs, seq) 1332 } 1333 ld.Bytes = uint64(lb) 1334 return &ld 1335 } 1336 1337 // For tombstones that we find and collect. 1338 var ( 1339 tombstones []uint64 1340 minTombstoneSeq uint64 1341 minTombstoneTs int64 1342 ) 1343 1344 for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; { 1345 if index+msgHdrSize > lbuf { 1346 truncate(index) 1347 return gatherLost(lbuf - index), tombstones, nil 1348 } 1349 1350 hdr := buf[index : index+msgHdrSize] 1351 rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:]) 1352 1353 hasHeaders := rl&hbit != 0 1354 // Clear any headers bit that could be set. 1355 rl &^= hbit 1356 dlen := int(rl) - msgHdrSize 1357 // Do some quick sanity checks here. 1358 if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { 1359 truncate(index) 1360 return gatherLost(lbuf - index), tombstones, errBadMsg 1361 } 1362 1363 // Check for checksum failures before additional processing. 1364 data := buf[index+msgHdrSize : index+rl] 1365 if hh := mb.hh; hh != nil { 1366 hh.Reset() 1367 hh.Write(hdr[4:20]) 1368 hh.Write(data[:slen]) 1369 if hasHeaders { 1370 hh.Write(data[slen+4 : dlen-recordHashSize]) 1371 } else { 1372 hh.Write(data[slen : dlen-recordHashSize]) 1373 } 1374 checksum := hh.Sum(nil) 1375 if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) { 1376 truncate(index) 1377 return gatherLost(lbuf - index), tombstones, errBadMsg 1378 } 1379 copy(mb.lchk[0:], checksum) 1380 } 1381 1382 // Grab our sequence and timestamp. 1383 seq := le.Uint64(hdr[4:]) 1384 ts := int64(le.Uint64(hdr[12:])) 1385 1386 // Check if this is a delete tombstone. 1387 if seq&tbit != 0 { 1388 seq = seq &^ tbit 1389 // Need to process this here and make sure we have accounted for this properly. 1390 tombstones = append(tombstones, seq) 1391 if minTombstoneSeq == 0 || seq < minTombstoneSeq { 1392 minTombstoneSeq, minTombstoneTs = seq, ts 1393 } 1394 index += rl 1395 continue 1396 } 1397 1398 fseq := atomic.LoadUint64(&mb.first.seq) 1399 // This is an old erased message, or a new one that we can track. 1400 if seq == 0 || seq&ebit != 0 || seq < fseq { 1401 seq = seq &^ ebit 1402 if seq >= fseq { 1403 // Only add to dmap if past recorded first seq and non-zero. 1404 if seq != 0 { 1405 addToDmap(seq) 1406 } 1407 atomic.StoreUint64(&mb.last.seq, seq) 1408 mb.last.ts = ts 1409 if mb.msgs == 0 { 1410 atomic.StoreUint64(&mb.first.seq, seq+1) 1411 mb.first.ts = 0 1412 } 1413 } 1414 index += rl 1415 continue 1416 } 1417 1418 // This is for when we have index info that adjusts for deleted messages 1419 // at the head. So the first.seq will be already set here. If this is larger 1420 // replace what we have with this seq. 1421 if firstNeedsSet && seq >= fseq { 1422 atomic.StoreUint64(&mb.first.seq, seq) 1423 firstNeedsSet, mb.first.ts = false, ts 1424 } 1425 1426 if !mb.dmap.Exists(seq) { 1427 mb.msgs++ 1428 mb.bytes += uint64(rl) 1429 } 1430 1431 // Always set last 1432 atomic.StoreUint64(&mb.last.seq, seq) 1433 mb.last.ts = ts 1434 1435 // Advance to next record. 1436 index += rl 1437 } 1438 1439 // For empty msg blocks make sure we recover last seq correctly based off of first. 1440 // Or if we seem to have no messages but had a tombstone, which we use to remember 1441 // sequences and timestamps now, use that to properly setup the first and last. 1442 if mb.msgs == 0 { 1443 fseq := atomic.LoadUint64(&mb.first.seq) 1444 if fseq > 0 { 1445 atomic.StoreUint64(&mb.last.seq, fseq-1) 1446 } else if fseq == 0 && minTombstoneSeq > 0 { 1447 atomic.StoreUint64(&mb.first.seq, minTombstoneSeq+1) 1448 mb.first.ts = 0 1449 if mb.last.seq == 0 { 1450 atomic.StoreUint64(&mb.last.seq, minTombstoneSeq) 1451 mb.last.ts = minTombstoneTs 1452 } 1453 } 1454 } 1455 1456 return nil, tombstones, nil 1457 } 1458 1459 // For doing warn logging. 1460 // Lock should be held. 1461 func (fs *fileStore) warn(format string, args ...any) { 1462 // No-op if no server configured. 1463 if fs.srv == nil { 1464 return 1465 } 1466 fs.srv.Warnf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...) 1467 } 1468 1469 // For doing debug logging. 1470 // Lock should be held. 1471 func (fs *fileStore) debug(format string, args ...any) { 1472 // No-op if no server configured. 1473 if fs.srv == nil { 1474 return 1475 } 1476 fs.srv.Debugf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...) 1477 } 1478 1479 // Track local state but ignore timestamps here. 1480 func updateTrackingState(state *StreamState, mb *msgBlock) { 1481 if state.FirstSeq == 0 { 1482 state.FirstSeq = mb.first.seq 1483 } else if mb.first.seq < state.FirstSeq { 1484 state.FirstSeq = mb.first.seq 1485 } 1486 if mb.last.seq > state.LastSeq { 1487 state.LastSeq = mb.last.seq 1488 } 1489 state.Msgs += mb.msgs 1490 state.Bytes += mb.bytes 1491 } 1492 1493 // Determine if our tracking states are the same. 1494 func trackingStatesEqual(fs, mb *StreamState) bool { 1495 // When a fs is brand new the fs state will have first seq of 0, but tracking mb may have 1. 1496 // If either has a first sequence that is not 0 or 1 we will check if they are the same, otherwise skip. 1497 if fs.FirstSeq > 1 || mb.FirstSeq > 1 { 1498 return fs.Msgs == mb.Msgs && fs.FirstSeq == mb.FirstSeq && fs.LastSeq == mb.LastSeq && fs.Bytes == mb.Bytes 1499 } 1500 return fs.Msgs == mb.Msgs && fs.LastSeq == mb.LastSeq && fs.Bytes == mb.Bytes 1501 } 1502 1503 // recoverFullState will attempt to receover our last full state and re-process any state changes 1504 // that happened afterwards. 1505 func (fs *fileStore) recoverFullState() (rerr error) { 1506 fs.mu.Lock() 1507 defer fs.mu.Unlock() 1508 1509 // Check for any left over purged messages. 1510 <-dios 1511 pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) 1512 if _, err := os.Stat(pdir); err == nil { 1513 os.RemoveAll(pdir) 1514 } 1515 // Grab our stream state file and load it in. 1516 fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) 1517 buf, err := os.ReadFile(fn) 1518 dios <- struct{}{} 1519 1520 if err != nil { 1521 if !os.IsNotExist(err) { 1522 fs.warn("Could not read stream state file: %v", err) 1523 } 1524 return err 1525 } 1526 1527 const minLen = 32 1528 if len(buf) < minLen { 1529 os.Remove(fn) 1530 fs.warn("Stream state too short (%d bytes)", len(buf)) 1531 return errCorruptState 1532 } 1533 1534 // The highwayhash will be on the end. Check that it still matches. 1535 h := buf[len(buf)-highwayhash.Size64:] 1536 buf = buf[:len(buf)-highwayhash.Size64] 1537 fs.hh.Reset() 1538 fs.hh.Write(buf) 1539 if !bytes.Equal(h, fs.hh.Sum(nil)) { 1540 os.Remove(fn) 1541 fs.warn("Stream state checksum did not match") 1542 return errCorruptState 1543 } 1544 1545 // Decrypt if needed. 1546 if fs.prf != nil { 1547 // We can be setup for encryption but if this is a snapshot restore we will be missing the keyfile 1548 // since snapshots strip encryption. 1549 if err := fs.recoverAEK(); err == nil { 1550 ns := fs.aek.NonceSize() 1551 buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:], nil) 1552 if err != nil { 1553 fs.warn("Stream state error reading encryption key: %v", err) 1554 return err 1555 } 1556 } 1557 } 1558 1559 if buf[0] != fullStateMagic || buf[1] != fullStateVersion { 1560 os.Remove(fn) 1561 fs.warn("Stream state magic and version mismatch") 1562 return errCorruptState 1563 } 1564 1565 bi := hdrLen 1566 1567 readU64 := func() uint64 { 1568 if bi < 0 { 1569 return 0 1570 } 1571 v, n := binary.Uvarint(buf[bi:]) 1572 if n <= 0 { 1573 bi = -1 1574 return 0 1575 } 1576 bi += n 1577 return v 1578 } 1579 readI64 := func() int64 { 1580 if bi < 0 { 1581 return 0 1582 } 1583 v, n := binary.Varint(buf[bi:]) 1584 if n <= 0 { 1585 bi = -1 1586 return -1 1587 } 1588 bi += n 1589 return v 1590 } 1591 1592 setTime := func(t *time.Time, ts int64) { 1593 if ts == 0 { 1594 *t = time.Time{} 1595 } else { 1596 *t = time.Unix(0, ts).UTC() 1597 } 1598 } 1599 1600 var state StreamState 1601 state.Msgs = readU64() 1602 state.Bytes = readU64() 1603 state.FirstSeq = readU64() 1604 baseTime := readI64() 1605 setTime(&state.FirstTime, baseTime) 1606 state.LastSeq = readU64() 1607 setTime(&state.LastTime, readI64()) 1608 1609 // Check for per subject info. 1610 if numSubjects := int(readU64()); numSubjects > 0 { 1611 fs.psim, fs.tsl = fs.psim.Empty(), 0 1612 for i := 0; i < numSubjects; i++ { 1613 if lsubj := int(readU64()); lsubj > 0 { 1614 if bi+lsubj > len(buf) { 1615 os.Remove(fn) 1616 fs.warn("Stream state bad subject len (%d)", lsubj) 1617 return errCorruptState 1618 } 1619 // If we have lots of subjects this will alloc for each one. 1620 // We could reference the underlying buffer, but we could guess wrong if 1621 // number of blocks is large and subjects is low, since we would reference buf. 1622 subj := buf[bi : bi+lsubj] 1623 // We had a bug that could cause memory corruption in the PSIM that could have gotten stored to disk. 1624 // Only would affect subjects, so do quick check. 1625 if !isValidSubject(string(subj), true) { 1626 os.Remove(fn) 1627 fs.warn("Stream state corrupt subject detected") 1628 return errCorruptState 1629 } 1630 bi += lsubj 1631 psi := psi{total: readU64(), fblk: uint32(readU64())} 1632 if psi.total > 1 { 1633 psi.lblk = uint32(readU64()) 1634 } else { 1635 psi.lblk = psi.fblk 1636 } 1637 fs.psim.Insert(subj, psi) 1638 fs.tsl += lsubj 1639 } 1640 } 1641 } 1642 1643 // Track the state as represented by the blocks themselves. 1644 var mstate StreamState 1645 1646 if numBlocks := readU64(); numBlocks > 0 { 1647 lastIndex := int(numBlocks - 1) 1648 fs.blks = make([]*msgBlock, 0, numBlocks) 1649 for i := 0; i < int(numBlocks); i++ { 1650 index, nbytes, fseq, fts, lseq, lts, numDeleted := uint32(readU64()), readU64(), readU64(), readI64(), readU64(), readI64(), readU64() 1651 if bi < 0 { 1652 break 1653 } 1654 mb := fs.initMsgBlock(index) 1655 atomic.StoreUint64(&mb.first.seq, fseq) 1656 atomic.StoreUint64(&mb.last.seq, lseq) 1657 mb.msgs, mb.bytes = lseq-fseq+1, nbytes 1658 mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime 1659 if numDeleted > 0 { 1660 dmap, n, err := avl.Decode(buf[bi:]) 1661 if err != nil { 1662 os.Remove(fn) 1663 fs.warn("Stream state error decoding avl dmap: %v", err) 1664 return errCorruptState 1665 } 1666 mb.dmap = *dmap 1667 if mb.msgs > numDeleted { 1668 mb.msgs -= numDeleted 1669 } else { 1670 mb.msgs = 0 1671 } 1672 bi += n 1673 } 1674 // Only add in if not empty or the lmb. 1675 if mb.msgs > 0 || i == lastIndex { 1676 fs.addMsgBlock(mb) 1677 updateTrackingState(&mstate, mb) 1678 } else { 1679 // Mark dirty to cleanup. 1680 fs.dirty++ 1681 } 1682 } 1683 } 1684 1685 // Pull in last block index for the block that had last checksum when we wrote the full state. 1686 blkIndex := uint32(readU64()) 1687 var lchk [8]byte 1688 if bi+len(lchk) > len(buf) { 1689 bi = -1 1690 } else { 1691 copy(lchk[0:], buf[bi:bi+len(lchk)]) 1692 } 1693 1694 // Check if we had any errors. 1695 if bi < 0 { 1696 os.Remove(fn) 1697 fs.warn("Stream state has no checksum present") 1698 return errCorruptState 1699 } 1700 1701 // Move into place our state, msgBlks and subject info. 1702 fs.state = state 1703 1704 // First let's check the happy path, open the blk file that was the lmb when we created the full state. 1705 // See if we have the last block available. 1706 var matched bool 1707 mb := fs.lmb 1708 if mb == nil || mb.index != blkIndex { 1709 fs.warn("Stream state block does not exist or index mismatch") 1710 return errCorruptState 1711 } 1712 if _, err := os.Stat(mb.mfn); err != nil && os.IsNotExist(err) { 1713 // If our saved state is past what we see on disk, fallback and rebuild. 1714 if ld, _, _ := mb.rebuildState(); ld != nil { 1715 fs.addLostData(ld) 1716 } 1717 fs.warn("Stream state detected prior state, could not locate msg block %d", blkIndex) 1718 return errPriorState 1719 } 1720 if matched = bytes.Equal(mb.lastChecksum(), lchk[:]); !matched { 1721 // Remove the last message block since recover will add in the new one. 1722 fs.removeMsgBlockFromList(mb) 1723 // Reverse update of tracking state for this mb, will add new state in below. 1724 mstate.Msgs -= mb.msgs 1725 mstate.Bytes -= mb.bytes 1726 if nmb, err := fs.recoverMsgBlockNoSubjectUpdates(mb.index); err != nil && !os.IsNotExist(err) { 1727 fs.warn("Stream state could not recover last msg block") 1728 os.Remove(fn) 1729 return errCorruptState 1730 } else if nmb != nil { 1731 fs.adjustAccounting(mb, nmb) 1732 updateTrackingState(&mstate, mb) 1733 } 1734 } 1735 1736 // On success double check our state. 1737 checkState := func() error { 1738 // We check first and last seq and number of msgs and bytes. If there is a difference, 1739 // return and error so we rebuild from the message block state on disk. 1740 if !trackingStatesEqual(&fs.state, &mstate) { 1741 fs.warn("Stream state encountered internal inconsistency on recover") 1742 os.Remove(fn) 1743 return errCorruptState 1744 } 1745 return nil 1746 } 1747 1748 // We may need to check other blocks. Even if we matched last checksum we will see if there is another block. 1749 for bi := blkIndex + 1; ; bi++ { 1750 nmb, err := fs.recoverMsgBlock(bi) 1751 if err != nil { 1752 if os.IsNotExist(err) { 1753 return checkState() 1754 } 1755 os.Remove(fn) 1756 fs.warn("Stream state could not recover msg block %d", bi) 1757 return err 1758 } 1759 if nmb != nil { 1760 // Update top level accounting 1761 if fseq := atomic.LoadUint64(&nmb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { 1762 fs.state.FirstSeq = fseq 1763 fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC() 1764 } 1765 if lseq := atomic.LoadUint64(&nmb.last.seq); lseq > fs.state.LastSeq { 1766 fs.state.LastSeq = lseq 1767 fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC() 1768 } 1769 fs.state.Msgs += nmb.msgs 1770 fs.state.Bytes += nmb.bytes 1771 updateTrackingState(&mstate, nmb) 1772 } 1773 } 1774 } 1775 1776 // adjustAccounting will be called when a stream state was only partially accounted for 1777 // within a message block, e.g. additional records were added after the stream state. 1778 // Lock should be held. 1779 func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) { 1780 nmb.mu.Lock() 1781 defer nmb.mu.Unlock() 1782 1783 // First make sure the new block is loaded. 1784 if nmb.cacheNotLoaded() { 1785 nmb.loadMsgsWithLock() 1786 } 1787 nmb.ensurePerSubjectInfoLoaded() 1788 1789 // Walk only new messages and update accounting at fs level. Any messages that should have 1790 // triggered limits exceeded will be handled after the recovery and prior to the stream 1791 // being available to the system. 1792 var smv StoreMsg 1793 for seq, lseq := atomic.LoadUint64(&mb.last.seq)+1, atomic.LoadUint64(&nmb.last.seq); seq <= lseq; seq++ { 1794 // Lookup the message. If an error will be deleted, so can skip. 1795 sm, err := nmb.cacheLookup(seq, &smv) 1796 if err != nil { 1797 continue 1798 } 1799 // Since we found it we just need to adjust fs totals and psim. 1800 fs.state.Msgs++ 1801 fs.state.Bytes += fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) 1802 if len(sm.subj) > 0 && fs.psim != nil { 1803 if info, ok := fs.psim.Find(stringToBytes(sm.subj)); ok { 1804 info.total++ 1805 if nmb.index > info.lblk { 1806 info.lblk = nmb.index 1807 } 1808 } else { 1809 fs.psim.Insert(stringToBytes(sm.subj), psi{total: 1, fblk: nmb.index, lblk: nmb.index}) 1810 fs.tsl += len(sm.subj) 1811 } 1812 } 1813 } 1814 1815 // Now check to see if we had a higher first for the recovered state mb vs nmb. 1816 if atomic.LoadUint64(&nmb.first.seq) < atomic.LoadUint64(&mb.first.seq) { 1817 // Now set first for nmb. 1818 atomic.StoreUint64(&nmb.first.seq, atomic.LoadUint64(&mb.first.seq)) 1819 } 1820 1821 // Update top level accounting. 1822 if fseq := atomic.LoadUint64(&nmb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { 1823 fs.state.FirstSeq = fseq 1824 fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC() 1825 } 1826 if lseq := atomic.LoadUint64(&nmb.last.seq); lseq > fs.state.LastSeq { 1827 fs.state.LastSeq = lseq 1828 fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC() 1829 } 1830 } 1831 1832 // Grabs last checksum for the named block file. 1833 // Takes into account encryption etc. 1834 func (mb *msgBlock) lastChecksum() []byte { 1835 f, err := mb.openBlock() 1836 if err != nil { 1837 return nil 1838 } 1839 defer f.Close() 1840 1841 var lchk [8]byte 1842 if fi, _ := f.Stat(); fi != nil { 1843 mb.rbytes = uint64(fi.Size()) 1844 } 1845 if mb.rbytes < checksumSize { 1846 return nil 1847 } 1848 // Encrypted? 1849 // Check for encryption, we do not load keys on startup anymore so might need to load them here. 1850 if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) { 1851 if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil { 1852 return nil 1853 } 1854 } 1855 if mb.bek != nil { 1856 if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize { 1857 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 1858 if err != nil { 1859 return nil 1860 } 1861 mb.bek = bek 1862 mb.bek.XORKeyStream(buf, buf) 1863 copy(lchk[0:], buf[len(buf)-checksumSize:]) 1864 } 1865 } else { 1866 f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize) 1867 } 1868 return lchk[:] 1869 } 1870 1871 // This will make sure we clean up old idx and fss files. 1872 func (fs *fileStore) cleanupOldMeta() { 1873 fs.mu.RLock() 1874 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 1875 fs.mu.RUnlock() 1876 1877 <-dios 1878 f, err := os.Open(mdir) 1879 dios <- struct{}{} 1880 if err != nil { 1881 return 1882 } 1883 1884 dirs, _ := f.ReadDir(-1) 1885 f.Close() 1886 1887 const ( 1888 minLen = 4 1889 idxSuffix = ".idx" 1890 fssSuffix = ".fss" 1891 ) 1892 for _, fi := range dirs { 1893 if name := fi.Name(); strings.HasSuffix(name, idxSuffix) || strings.HasSuffix(name, fssSuffix) { 1894 os.Remove(filepath.Join(mdir, name)) 1895 } 1896 } 1897 } 1898 1899 func (fs *fileStore) recoverMsgs() error { 1900 fs.mu.Lock() 1901 defer fs.mu.Unlock() 1902 1903 // Check for any left over purged messages. 1904 <-dios 1905 pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) 1906 if _, err := os.Stat(pdir); err == nil { 1907 os.RemoveAll(pdir) 1908 } 1909 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 1910 f, err := os.Open(mdir) 1911 if err != nil { 1912 dios <- struct{}{} 1913 return errNotReadable 1914 } 1915 dirs, err := f.ReadDir(-1) 1916 f.Close() 1917 dios <- struct{}{} 1918 1919 if err != nil { 1920 return errNotReadable 1921 } 1922 1923 indices := make(sort.IntSlice, 0, len(dirs)) 1924 var index int 1925 for _, fi := range dirs { 1926 if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 { 1927 indices = append(indices, index) 1928 } 1929 } 1930 indices.Sort() 1931 1932 // Recover all of the msg blocks. 1933 // We now guarantee they are coming in order. 1934 for _, index := range indices { 1935 if mb, err := fs.recoverMsgBlock(uint32(index)); err == nil && mb != nil { 1936 // This is a truncate block with possibly no index. If the OS got shutdown 1937 // out from underneath of us this is possible. 1938 if mb.first.seq == 0 { 1939 mb.dirtyCloseWithRemove(true) 1940 fs.removeMsgBlockFromList(mb) 1941 continue 1942 } 1943 if fseq := atomic.LoadUint64(&mb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { 1944 fs.state.FirstSeq = fseq 1945 if mb.first.ts == 0 { 1946 fs.state.FirstTime = time.Time{} 1947 } else { 1948 fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() 1949 } 1950 } 1951 if lseq := atomic.LoadUint64(&mb.last.seq); lseq > fs.state.LastSeq { 1952 fs.state.LastSeq = lseq 1953 if mb.last.ts == 0 { 1954 fs.state.LastTime = time.Time{} 1955 } else { 1956 fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() 1957 } 1958 } 1959 fs.state.Msgs += mb.msgs 1960 fs.state.Bytes += mb.bytes 1961 } else { 1962 return err 1963 } 1964 } 1965 1966 if len(fs.blks) > 0 { 1967 fs.lmb = fs.blks[len(fs.blks)-1] 1968 } else { 1969 _, err = fs.newMsgBlockForWrite() 1970 } 1971 1972 // Check if we encountered any lost data. 1973 if fs.ld != nil { 1974 var emptyBlks []*msgBlock 1975 for _, mb := range fs.blks { 1976 if mb.msgs == 0 && mb.rbytes == 0 { 1977 emptyBlks = append(emptyBlks, mb) 1978 } 1979 } 1980 for _, mb := range emptyBlks { 1981 // Need the mb lock here. 1982 mb.mu.Lock() 1983 fs.removeMsgBlock(mb) 1984 mb.mu.Unlock() 1985 } 1986 } 1987 1988 if err != nil { 1989 return err 1990 } 1991 1992 // Check for keyfiles orphans. 1993 if kms, err := filepath.Glob(filepath.Join(mdir, keyScanAll)); err == nil && len(kms) > 0 { 1994 valid := make(map[uint32]bool) 1995 for _, mb := range fs.blks { 1996 valid[mb.index] = true 1997 } 1998 for _, fn := range kms { 1999 var index uint32 2000 shouldRemove := true 2001 if n, err := fmt.Sscanf(filepath.Base(fn), keyScan, &index); err == nil && n == 1 && valid[index] { 2002 shouldRemove = false 2003 } 2004 if shouldRemove { 2005 os.Remove(fn) 2006 } 2007 } 2008 } 2009 2010 return nil 2011 } 2012 2013 // Will expire msgs that have aged out on restart. 2014 // We will treat this differently in case we have a recovery 2015 // that will expire alot of messages on startup. 2016 // Should only be called on startup. 2017 func (fs *fileStore) expireMsgsOnRecover() { 2018 if fs.state.Msgs == 0 { 2019 return 2020 } 2021 2022 var minAge = time.Now().UnixNano() - int64(fs.cfg.MaxAge) 2023 var purged, bytes uint64 2024 var deleted int 2025 var nts int64 2026 2027 // If we expire all make sure to write out a tombstone. Need to be done by hand here, 2028 // usually taken care of by fs.removeMsgBlock() but we do not call that here. 2029 var last msgId 2030 2031 deleteEmptyBlock := func(mb *msgBlock) { 2032 // If we are the last keep state to remember first/last sequence. 2033 // Do this part by hand since not deleting one by one. 2034 if mb == fs.lmb { 2035 last.seq = atomic.LoadUint64(&mb.last.seq) 2036 last.ts = mb.last.ts 2037 } 2038 // Make sure we do subject cleanup as well. 2039 mb.ensurePerSubjectInfoLoaded() 2040 for subj, ss := range mb.fss { 2041 for i := uint64(0); i < ss.Msgs; i++ { 2042 fs.removePerSubject(subj) 2043 } 2044 } 2045 mb.dirtyCloseWithRemove(true) 2046 deleted++ 2047 } 2048 2049 for _, mb := range fs.blks { 2050 mb.mu.Lock() 2051 if minAge < mb.first.ts { 2052 nts = mb.first.ts 2053 mb.mu.Unlock() 2054 break 2055 } 2056 // Can we remove whole block here? 2057 if mb.last.ts <= minAge { 2058 purged += mb.msgs 2059 bytes += mb.bytes 2060 deleteEmptyBlock(mb) 2061 mb.mu.Unlock() 2062 continue 2063 } 2064 2065 // If we are here we have to process the interior messages of this blk. 2066 // This will load fss as well. 2067 if err := mb.loadMsgsWithLock(); err != nil { 2068 mb.mu.Unlock() 2069 break 2070 } 2071 2072 var smv StoreMsg 2073 var needNextFirst bool 2074 2075 // Walk messages and remove if expired. 2076 fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 2077 for seq := fseq; seq <= lseq; seq++ { 2078 sm, err := mb.cacheLookup(seq, &smv) 2079 // Process interior deleted msgs. 2080 if err == errDeletedMsg { 2081 // Update dmap. 2082 if mb.dmap.Exists(seq) { 2083 mb.dmap.Delete(seq) 2084 } 2085 // Keep this updated just in case since we are removing dmap entries. 2086 atomic.StoreUint64(&mb.first.seq, seq) 2087 needNextFirst = true 2088 continue 2089 } 2090 // Break on other errors. 2091 if err != nil || sm == nil { 2092 atomic.StoreUint64(&mb.first.seq, seq) 2093 needNextFirst = true 2094 break 2095 } 2096 2097 // No error and sm != nil from here onward. 2098 2099 // Check for done. 2100 if minAge < sm.ts { 2101 atomic.StoreUint64(&mb.first.seq, sm.seq) 2102 mb.first.ts = sm.ts 2103 needNextFirst = false 2104 nts = sm.ts 2105 break 2106 } 2107 2108 // Delete the message here. 2109 if mb.msgs > 0 { 2110 atomic.StoreUint64(&mb.first.seq, seq) 2111 needNextFirst = true 2112 sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) 2113 if sz > mb.bytes { 2114 sz = mb.bytes 2115 } 2116 mb.bytes -= sz 2117 bytes += sz 2118 mb.msgs-- 2119 purged++ 2120 } 2121 // Update fss 2122 // Make sure we have fss loaded. 2123 mb.removeSeqPerSubject(sm.subj, seq) 2124 fs.removePerSubject(sm.subj) 2125 } 2126 // Make sure we have a proper next first sequence. 2127 if needNextFirst { 2128 mb.selectNextFirst() 2129 } 2130 // Check if empty after processing, could happen if tail of messages are all deleted. 2131 if mb.msgs == 0 { 2132 deleteEmptyBlock(mb) 2133 } 2134 mb.mu.Unlock() 2135 break 2136 } 2137 2138 if nts > 0 { 2139 // Make sure to set age check based on this value. 2140 fs.resetAgeChk(nts - minAge) 2141 } 2142 2143 if deleted > 0 { 2144 // Update block map. 2145 if fs.bim != nil { 2146 for _, mb := range fs.blks[:deleted] { 2147 delete(fs.bim, mb.index) 2148 } 2149 } 2150 // Update blks slice. 2151 fs.blks = copyMsgBlocks(fs.blks[deleted:]) 2152 if lb := len(fs.blks); lb == 0 { 2153 fs.lmb = nil 2154 } else { 2155 fs.lmb = fs.blks[lb-1] 2156 } 2157 } 2158 // Update top level accounting. 2159 if purged < fs.state.Msgs { 2160 fs.state.Msgs -= purged 2161 } else { 2162 fs.state.Msgs = 0 2163 } 2164 if bytes < fs.state.Bytes { 2165 fs.state.Bytes -= bytes 2166 } else { 2167 fs.state.Bytes = 0 2168 } 2169 // Make sure to we properly set the fs first sequence and timestamp. 2170 fs.selectNextFirst() 2171 2172 // Check if we have no messages and blocks left. 2173 if fs.lmb == nil && last.seq != 0 { 2174 if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { 2175 lmb.writeTombstone(last.seq, last.ts) 2176 } 2177 // Clear any global subject state. 2178 fs.psim, fs.tsl = fs.psim.Empty(), 0 2179 } 2180 2181 // If we purged anything, make sure we kick flush state loop. 2182 if purged > 0 { 2183 fs.dirty++ 2184 fs.kickFlushStateLoop() 2185 } 2186 } 2187 2188 func copyMsgBlocks(src []*msgBlock) []*msgBlock { 2189 if src == nil { 2190 return nil 2191 } 2192 dst := make([]*msgBlock, len(src)) 2193 copy(dst, src) 2194 return dst 2195 } 2196 2197 // GetSeqFromTime looks for the first sequence number that has 2198 // the message with >= timestamp. 2199 // FIXME(dlc) - inefficient, and dumb really. Make this better. 2200 func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 { 2201 fs.mu.RLock() 2202 lastSeq := fs.state.LastSeq 2203 closed := fs.closed 2204 fs.mu.RUnlock() 2205 2206 if closed { 2207 return 0 2208 } 2209 2210 mb := fs.selectMsgBlockForStart(t) 2211 if mb == nil { 2212 return lastSeq + 1 2213 } 2214 2215 fseq := atomic.LoadUint64(&mb.first.seq) 2216 lseq := atomic.LoadUint64(&mb.last.seq) 2217 2218 var smv StoreMsg 2219 2220 // Linear search, hence the dumb part.. 2221 ts := t.UnixNano() 2222 for seq := fseq; seq <= lseq; seq++ { 2223 sm, _, _ := mb.fetchMsg(seq, &smv) 2224 if sm != nil && sm.ts >= ts { 2225 return sm.seq 2226 } 2227 } 2228 return 0 2229 } 2230 2231 // Find the first matching message. 2232 func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) { 2233 mb.mu.Lock() 2234 defer mb.mu.Unlock() 2235 2236 fseq, isAll, subs := start, filter == _EMPTY_ || filter == fwcs, []string{filter} 2237 2238 var didLoad bool 2239 if mb.fssNotLoaded() { 2240 // Make sure we have fss loaded. 2241 mb.loadMsgsWithLock() 2242 didLoad = true 2243 } 2244 2245 // If we only have 1 subject currently and it matches our filter we can also set isAll. 2246 if !isAll && len(mb.fss) == 1 { 2247 _, isAll = mb.fss[filter] 2248 } 2249 // Make sure to start at mb.first.seq if fseq < mb.first.seq 2250 if seq := atomic.LoadUint64(&mb.first.seq); seq > fseq { 2251 fseq = seq 2252 } 2253 lseq := atomic.LoadUint64(&mb.last.seq) 2254 2255 // Optionally build the isMatch for wildcard filters. 2256 tsa := [32]string{} 2257 fsa := [32]string{} 2258 var fts []string 2259 var isMatch func(subj string) bool 2260 // Decide to build. 2261 if wc { 2262 fts = tokenizeSubjectIntoSlice(fsa[:0], filter) 2263 isMatch = func(subj string) bool { 2264 tts := tokenizeSubjectIntoSlice(tsa[:0], subj) 2265 return isSubsetMatchTokenized(tts, fts) 2266 } 2267 } 2268 // Only do linear scan if isAll or we are wildcarded and have to traverse more fss than actual messages. 2269 doLinearScan := isAll || (wc && len(mb.fss) > int(lseq-fseq)) 2270 if !doLinearScan { 2271 // If we have a wildcard match against all tracked subjects we know about. 2272 if wc { 2273 subs = subs[:0] 2274 for subj := range mb.fss { 2275 if isMatch(subj) { 2276 subs = append(subs, subj) 2277 } 2278 } 2279 } 2280 fseq = lseq + 1 2281 for _, subj := range subs { 2282 ss := mb.fss[subj] 2283 if ss != nil && ss.firstNeedsUpdate { 2284 mb.recalculateFirstForSubj(subj, ss.First, ss) 2285 } 2286 if ss == nil || start > ss.Last || ss.First >= fseq { 2287 continue 2288 } 2289 if ss.First < start { 2290 fseq = start 2291 } else { 2292 fseq = ss.First 2293 } 2294 } 2295 } 2296 2297 // If we guess to not do a linear scan, but the above resulted in alot of subs that will 2298 // need to be checked for every scanned message, revert. 2299 // TODO(dlc) - we could memoize the subs across calls. 2300 if len(subs) > int(lseq-fseq) { 2301 doLinearScan = true 2302 } 2303 2304 if fseq > lseq { 2305 return nil, didLoad, ErrStoreMsgNotFound 2306 } 2307 2308 // Need messages loaded from here on out. 2309 if mb.cacheNotLoaded() { 2310 if err := mb.loadMsgsWithLock(); err != nil { 2311 return nil, false, err 2312 } 2313 didLoad = true 2314 } 2315 2316 if sm == nil { 2317 sm = new(StoreMsg) 2318 } 2319 2320 for seq := fseq; seq <= lseq; seq++ { 2321 llseq := mb.llseq 2322 fsm, err := mb.cacheLookup(seq, sm) 2323 if err != nil { 2324 continue 2325 } 2326 expireOk := seq == lseq && mb.llseq == seq 2327 if isAll { 2328 return fsm, expireOk, nil 2329 } 2330 if doLinearScan { 2331 if wc && isMatch(sm.subj) { 2332 return fsm, expireOk, nil 2333 } else if !wc && fsm.subj == filter { 2334 return fsm, expireOk, nil 2335 } 2336 } else { 2337 for _, subj := range subs { 2338 if fsm.subj == subj { 2339 return fsm, expireOk, nil 2340 } 2341 } 2342 } 2343 // If we are here we did not match, so put the llseq back. 2344 mb.llseq = llseq 2345 } 2346 2347 return nil, didLoad, ErrStoreMsgNotFound 2348 } 2349 2350 // This will traverse a message block and generate the filtered pending. 2351 func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, first, last uint64) { 2352 mb.mu.Lock() 2353 defer mb.mu.Unlock() 2354 return mb.filteredPendingLocked(subj, wc, seq) 2355 } 2356 2357 // This will traverse a message block and generate the filtered pending. 2358 // Lock should be held. 2359 func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (total, first, last uint64) { 2360 isAll := filter == _EMPTY_ || filter == fwcs 2361 2362 // First check if we can optimize this part. 2363 // This means we want all and the starting sequence was before this block. 2364 if isAll { 2365 if fseq := atomic.LoadUint64(&mb.first.seq); sseq <= fseq { 2366 return mb.msgs, fseq, atomic.LoadUint64(&mb.last.seq) 2367 } 2368 } 2369 2370 update := func(ss *SimpleState) { 2371 total += ss.Msgs 2372 if first == 0 || ss.First < first { 2373 first = ss.First 2374 } 2375 if ss.Last > last { 2376 last = ss.Last 2377 } 2378 } 2379 2380 // Make sure we have fss loaded. 2381 mb.ensurePerSubjectInfoLoaded() 2382 2383 tsa := [32]string{} 2384 fsa := [32]string{} 2385 fts := tokenizeSubjectIntoSlice(fsa[:0], filter) 2386 2387 // 1. See if we match any subs from fss. 2388 // 2. If we match and the sseq is past ss.Last then we can use meta only. 2389 // 3. If we match and we need to do a partial, break and clear any totals and do a full scan like num pending. 2390 2391 isMatch := func(subj string) bool { 2392 if !wc { 2393 return subj == filter 2394 } 2395 tts := tokenizeSubjectIntoSlice(tsa[:0], subj) 2396 return isSubsetMatchTokenized(tts, fts) 2397 } 2398 2399 var havePartial bool 2400 for subj, ss := range mb.fss { 2401 if isAll || isMatch(subj) { 2402 if ss.firstNeedsUpdate { 2403 mb.recalculateFirstForSubj(subj, ss.First, ss) 2404 } 2405 if sseq <= ss.First { 2406 update(ss) 2407 } else if sseq <= ss.Last { 2408 // We matched but its a partial. 2409 havePartial = true 2410 break 2411 } 2412 } 2413 } 2414 2415 // If we did not encounter any partials we can return here. 2416 if !havePartial { 2417 return total, first, last 2418 } 2419 2420 // If we are here we need to scan the msgs. 2421 // Clear what we had. 2422 total, first, last = 0, 0, 0 2423 2424 // If we load the cache for a linear scan we want to expire that cache upon exit. 2425 var shouldExpire bool 2426 if mb.cacheNotLoaded() { 2427 mb.loadMsgsWithLock() 2428 shouldExpire = true 2429 } 2430 2431 var smv StoreMsg 2432 for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { 2433 sm, _ := mb.cacheLookup(seq, &smv) 2434 if sm == nil { 2435 continue 2436 } 2437 if isAll || isMatch(sm.subj) { 2438 total++ 2439 if first == 0 || seq < first { 2440 first = seq 2441 } 2442 if seq > last { 2443 last = seq 2444 } 2445 } 2446 } 2447 // If we loaded this block for this operation go ahead and expire it here. 2448 if shouldExpire { 2449 mb.tryForceExpireCacheLocked() 2450 } 2451 2452 return total, first, last 2453 } 2454 2455 // FilteredState will return the SimpleState associated with the filtered subject and a proposed starting sequence. 2456 func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState { 2457 fs.mu.RLock() 2458 defer fs.mu.RUnlock() 2459 2460 lseq := fs.state.LastSeq 2461 if sseq < fs.state.FirstSeq { 2462 sseq = fs.state.FirstSeq 2463 } 2464 2465 // Returned state. 2466 var ss SimpleState 2467 2468 // If past the end no results. 2469 if sseq > lseq { 2470 // Make sure we track sequences 2471 ss.First = fs.state.FirstSeq 2472 ss.Last = fs.state.LastSeq 2473 return ss 2474 } 2475 2476 // If we want all msgs that match we can shortcircuit. 2477 // TODO(dlc) - This can be extended for all cases but would 2478 // need to be careful on total msgs calculations etc. 2479 if sseq == fs.state.FirstSeq { 2480 fs.numFilteredPending(subj, &ss) 2481 } else { 2482 wc := subjectHasWildcard(subj) 2483 // Tracking subject state. 2484 // TODO(dlc) - Optimize for 2.10 with avl tree and no atomics per block. 2485 for _, mb := range fs.blks { 2486 // Skip blocks that are less than our starting sequence. 2487 if sseq > atomic.LoadUint64(&mb.last.seq) { 2488 continue 2489 } 2490 t, f, l := mb.filteredPending(subj, wc, sseq) 2491 ss.Msgs += t 2492 if ss.First == 0 || (f > 0 && f < ss.First) { 2493 ss.First = f 2494 } 2495 if l > ss.Last { 2496 ss.Last = l 2497 } 2498 } 2499 } 2500 2501 return ss 2502 } 2503 2504 // Optimized way for getting all num pending matching a filter subject. 2505 // Lock should be held. 2506 func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) { 2507 isAll := filter == _EMPTY_ || filter == fwcs 2508 2509 // If isAll we do not need to do anything special to calculate the first and last and total. 2510 if isAll { 2511 ss.First = fs.state.FirstSeq 2512 ss.Last = fs.state.LastSeq 2513 ss.Msgs = fs.state.Msgs 2514 return 2515 } 2516 2517 start, stop := uint32(math.MaxUint32), uint32(0) 2518 fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) { 2519 ss.Msgs += psi.total 2520 // Keep track of start and stop indexes for this subject. 2521 if psi.fblk < start { 2522 start = psi.fblk 2523 } 2524 if psi.lblk > stop { 2525 stop = psi.lblk 2526 } 2527 }) 2528 // We do need to figure out the first and last sequences. 2529 wc := subjectHasWildcard(filter) 2530 // Do start 2531 mb := fs.bim[start] 2532 if mb != nil { 2533 _, f, _ := mb.filteredPending(filter, wc, 0) 2534 ss.First = f 2535 } 2536 if ss.First == 0 { 2537 // This is a miss. This can happen since psi.fblk is lazy, but should be very rare. 2538 for i := start + 1; i <= stop; i++ { 2539 mb := fs.bim[i] 2540 if mb == nil { 2541 continue 2542 } 2543 if _, f, _ := mb.filteredPending(filter, wc, 0); f > 0 { 2544 ss.First = f 2545 break 2546 } 2547 } 2548 } 2549 // Now last 2550 if mb = fs.bim[stop]; mb != nil { 2551 _, _, l := mb.filteredPending(filter, wc, 0) 2552 ss.Last = l 2553 } 2554 } 2555 2556 // SubjectsState returns a map of SimpleState for all matching subjects. 2557 func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState { 2558 fs.mu.RLock() 2559 defer fs.mu.RUnlock() 2560 2561 if fs.state.Msgs == 0 || fs.noTrackSubjects() { 2562 return nil 2563 } 2564 2565 start, stop := fs.blks[0], fs.lmb 2566 // We can short circuit if not a wildcard using psim for start and stop. 2567 if !subjectHasWildcard(subject) { 2568 info, ok := fs.psim.Find(stringToBytes(subject)) 2569 if !ok { 2570 return nil 2571 } 2572 start, stop = fs.bim[info.fblk], fs.bim[info.lblk] 2573 } 2574 2575 // Aggregate fss. 2576 fss := make(map[string]SimpleState) 2577 var startFound bool 2578 2579 for _, mb := range fs.blks { 2580 if !startFound { 2581 if mb != start { 2582 continue 2583 } 2584 startFound = true 2585 } 2586 2587 mb.mu.Lock() 2588 var shouldExpire bool 2589 if mb.fssNotLoaded() { 2590 // Make sure we have fss loaded. 2591 mb.loadMsgsWithLock() 2592 shouldExpire = true 2593 } 2594 for subj, ss := range mb.fss { 2595 if subject == _EMPTY_ || subject == fwcs || subjectIsSubsetMatch(subj, subject) { 2596 if ss.firstNeedsUpdate { 2597 mb.recalculateFirstForSubj(subj, ss.First, ss) 2598 } 2599 oss := fss[subj] 2600 if oss.First == 0 { // New 2601 fss[subj] = *ss 2602 } else { 2603 // Merge here. 2604 oss.Last, oss.Msgs = ss.Last, oss.Msgs+ss.Msgs 2605 fss[subj] = oss 2606 } 2607 } 2608 } 2609 if shouldExpire { 2610 // Expire this cache before moving on. 2611 mb.tryForceExpireCacheLocked() 2612 } 2613 mb.mu.Unlock() 2614 2615 if mb == stop { 2616 break 2617 } 2618 } 2619 2620 return fss 2621 } 2622 2623 // MultiLastSeqs will return a sorted list of sequences that match all subjects presented in filters. 2624 // We will not exceed the maxSeq, which if 0 becomes the store's last sequence. 2625 func (fs *fileStore) MultiLastSeqs(filters []string, maxSeq uint64, maxAllowed int) ([]uint64, error) { 2626 fs.mu.RLock() 2627 defer fs.mu.RUnlock() 2628 2629 if fs.state.Msgs == 0 || fs.noTrackSubjects() { 2630 return nil, nil 2631 } 2632 2633 lastBlkIndex := len(fs.blks) - 1 2634 lastMB := fs.blks[lastBlkIndex] 2635 2636 // Implied last sequence. 2637 if maxSeq == 0 { 2638 maxSeq = fs.state.LastSeq 2639 } else { 2640 // Udate last mb index if not last seq. 2641 lastBlkIndex, lastMB = fs.selectMsgBlockWithIndex(maxSeq) 2642 } 2643 //Make sure non-nil 2644 if lastMB == nil { 2645 return nil, nil 2646 } 2647 2648 // Grab our last mb index (not same as blk index). 2649 lastMB.mu.RLock() 2650 lastMBIndex := lastMB.index 2651 lastMB.mu.RUnlock() 2652 2653 subs := make(map[string]*psi) 2654 ltSeen := make(map[string]uint32) 2655 for _, filter := range filters { 2656 fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) { 2657 s := string(subj) 2658 subs[s] = psi 2659 if psi.lblk < lastMBIndex { 2660 ltSeen[s] = psi.lblk 2661 } 2662 }) 2663 } 2664 2665 // If all subjects have a lower last index, select the largest for our walk backwards. 2666 if len(ltSeen) == len(subs) { 2667 max := uint32(0) 2668 for _, mbi := range ltSeen { 2669 if mbi > max { 2670 max = mbi 2671 } 2672 } 2673 lastMB = fs.bim[max] 2674 } 2675 2676 // Collect all sequences needed. 2677 seqs := make([]uint64, 0, len(subs)) 2678 for i, lnf := lastBlkIndex, false; i >= 0; i-- { 2679 if len(subs) == 0 { 2680 break 2681 } 2682 mb := fs.blks[i] 2683 if !lnf { 2684 if mb != lastMB { 2685 continue 2686 } 2687 lnf = true 2688 } 2689 // We can start properly looking here. 2690 mb.mu.Lock() 2691 mb.ensurePerSubjectInfoLoaded() 2692 for subj, psi := range subs { 2693 if ss := mb.fss[subj]; ss != nil { 2694 if ss.Last <= maxSeq { 2695 seqs = append(seqs, ss.Last) 2696 delete(subs, subj) 2697 } else { 2698 // Need to search for it since last is > maxSeq. 2699 if mb.cacheNotLoaded() { 2700 mb.loadMsgsWithLock() 2701 } 2702 var smv StoreMsg 2703 fseq := atomic.LoadUint64(&mb.first.seq) 2704 for seq := maxSeq; seq >= fseq; seq-- { 2705 sm, _ := mb.cacheLookup(seq, &smv) 2706 if sm == nil || sm.subj != subj { 2707 continue 2708 } 2709 seqs = append(seqs, sm.seq) 2710 delete(subs, subj) 2711 break 2712 } 2713 } 2714 } else if mb.index <= psi.fblk { 2715 // Track which subs are no longer applicable, meaning we will not find a valid msg at this point. 2716 delete(subs, subj) 2717 } 2718 // TODO(dlc) we could track lblk like above in case some subs are very far apart. 2719 // Not too bad if fss loaded since we will skip over quickly with it loaded, but might be worth it. 2720 } 2721 mb.mu.Unlock() 2722 2723 // If maxAllowed was sepcified check that we will not exceed that. 2724 if maxAllowed > 0 && len(seqs) > maxAllowed { 2725 return nil, ErrTooManyResults 2726 } 2727 2728 } 2729 if len(seqs) == 0 { 2730 return nil, nil 2731 } 2732 sort.Slice(seqs, func(i, j int) bool { return seqs[i] < seqs[j] }) 2733 return seqs, nil 2734 } 2735 2736 // NumPending will return the number of pending messages matching the filter subject starting at sequence. 2737 // Optimized for stream num pending calculations for consumers. 2738 func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) (total, validThrough uint64) { 2739 fs.mu.RLock() 2740 defer fs.mu.RUnlock() 2741 2742 // This can always be last for these purposes. 2743 validThrough = fs.state.LastSeq 2744 2745 if fs.state.Msgs == 0 || sseq > fs.state.LastSeq { 2746 return 0, validThrough 2747 } 2748 2749 // Track starting for both block for the sseq and staring block that matches any subject. 2750 var seqStart int 2751 // See if we need to figure out starting block per sseq. 2752 if sseq > fs.state.FirstSeq { 2753 // This should not, but can return -1, so make sure we check to avoid panic below. 2754 if seqStart, _ = fs.selectMsgBlockWithIndex(sseq); seqStart < 0 { 2755 seqStart = 0 2756 } 2757 } 2758 2759 isAll := filter == _EMPTY_ || filter == fwcs 2760 wc := subjectHasWildcard(filter) 2761 2762 // See if filter was provided but its the only subject. 2763 if !isAll && !wc && fs.psim.Size() == 1 { 2764 if _, ok := fs.psim.Find(stringToBytes(filter)); ok { 2765 isAll = true 2766 } 2767 } 2768 if isAll && filter == _EMPTY_ { 2769 filter = fwcs 2770 } 2771 // If we are isAll and have no deleted we can do a simpler calculation. 2772 if !lastPerSubject && isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs { 2773 if sseq == 0 { 2774 return fs.state.Msgs, validThrough 2775 } 2776 return fs.state.LastSeq - sseq + 1, validThrough 2777 } 2778 2779 var tsa, fsa [32]string 2780 fts := tokenizeSubjectIntoSlice(fsa[:0], filter) 2781 2782 isMatch := func(subj string) bool { 2783 if isAll { 2784 return true 2785 } 2786 if !wc { 2787 return subj == filter 2788 } 2789 tts := tokenizeSubjectIntoSlice(tsa[:0], subj) 2790 return isSubsetMatchTokenized(tts, fts) 2791 } 2792 2793 // Handle last by subject a bit differently. 2794 // We will scan PSIM since we accurately track the last block we have seen the subject in. This 2795 // allows us to only need to load at most one block now. 2796 // For the last block, we need to track the subjects that we know are in that block, and track seen 2797 // while in the block itself, but complexity there worth it. 2798 if lastPerSubject { 2799 // If we want all and our start sequence is equal or less than first return number of subjects. 2800 if isAll && sseq <= fs.state.FirstSeq { 2801 return uint64(fs.psim.Size()), validThrough 2802 } 2803 // If we are here we need to scan. We are going to scan the PSIM looking for lblks that are >= seqStart. 2804 // This will build up a list of all subjects from the selected block onward. 2805 lbm := make(map[string]bool) 2806 mb := fs.blks[seqStart] 2807 bi := mb.index 2808 2809 fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) { 2810 // If the select blk start is greater than entry's last blk skip. 2811 if bi > psi.lblk { 2812 return 2813 } 2814 total++ 2815 // We will track the subjects that are an exact match to the last block. 2816 // This is needed for last block processing. 2817 if psi.lblk == bi { 2818 lbm[string(subj)] = true 2819 } 2820 }) 2821 2822 // Now check if we need to inspect the seqStart block. 2823 // Grab write lock in case we need to load in msgs. 2824 mb.mu.Lock() 2825 var shouldExpire bool 2826 // We need to walk this block to correct accounting from above. 2827 if sseq > mb.first.seq { 2828 // Track the ones we add back in case more than one. 2829 seen := make(map[string]bool) 2830 // We need to discount the total by subjects seen before sseq, but also add them right back in if they are >= sseq for this blk. 2831 // This only should be subjects we know have the last blk in this block. 2832 if mb.cacheNotLoaded() { 2833 mb.loadMsgsWithLock() 2834 shouldExpire = true 2835 } 2836 var smv StoreMsg 2837 for seq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { 2838 sm, _ := mb.cacheLookup(seq, &smv) 2839 if sm == nil || sm.subj == _EMPTY_ || !lbm[sm.subj] { 2840 continue 2841 } 2842 if isMatch(sm.subj) { 2843 // If less than sseq adjust off of total as long as this subject matched the last block. 2844 if seq < sseq { 2845 if !seen[sm.subj] { 2846 total-- 2847 seen[sm.subj] = true 2848 } 2849 } else if seen[sm.subj] { 2850 // This is equal or more than sseq, so add back in. 2851 total++ 2852 // Make sure to not process anymore. 2853 delete(seen, sm.subj) 2854 } 2855 } 2856 } 2857 } 2858 // If we loaded the block try to force expire. 2859 if shouldExpire { 2860 mb.tryForceExpireCacheLocked() 2861 } 2862 mb.mu.Unlock() 2863 return total, validThrough 2864 } 2865 2866 // If we would need to scan more from the beginning, revert back to calculating directly here. 2867 // TODO(dlc) - Redo properly with sublists etc for subject-based filtering. 2868 if seqStart >= (len(fs.blks) / 2) { 2869 for i := seqStart; i < len(fs.blks); i++ { 2870 var shouldExpire bool 2871 mb := fs.blks[i] 2872 // Hold write lock in case we need to load cache. 2873 mb.mu.Lock() 2874 var t uint64 2875 if isAll && sseq <= atomic.LoadUint64(&mb.first.seq) { 2876 total += mb.msgs 2877 mb.mu.Unlock() 2878 continue 2879 } 2880 // If we are here we need to at least scan the subject fss. 2881 // Make sure we have fss loaded. 2882 if mb.fssNotLoaded() { 2883 mb.loadMsgsWithLock() 2884 shouldExpire = true 2885 } 2886 var havePartial bool 2887 for subj, ss := range mb.fss { 2888 if isMatch(subj) { 2889 if ss.firstNeedsUpdate { 2890 mb.recalculateFirstForSubj(subj, ss.First, ss) 2891 } 2892 if sseq <= ss.First { 2893 t += ss.Msgs 2894 } else if sseq <= ss.Last { 2895 // We matched but its a partial. 2896 havePartial = true 2897 break 2898 } 2899 } 2900 } 2901 // See if we need to scan msgs here. 2902 if havePartial { 2903 // Make sure we have the cache loaded. 2904 if mb.cacheNotLoaded() { 2905 mb.loadMsgsWithLock() 2906 shouldExpire = true 2907 } 2908 // Clear on partial. 2909 t = 0 2910 var smv StoreMsg 2911 for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { 2912 if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && isMatch(sm.subj) { 2913 t++ 2914 } 2915 } 2916 } 2917 // If we loaded this block for this operation go ahead and expire it here. 2918 if shouldExpire { 2919 mb.tryForceExpireCacheLocked() 2920 } 2921 mb.mu.Unlock() 2922 total += t 2923 } 2924 return total, validThrough 2925 } 2926 2927 // If we are here it's better to calculate totals from psim and adjust downward by scanning less blocks. 2928 // TODO(dlc) - Eventually when sublist uses generics, make this sublist driven instead. 2929 start := uint32(math.MaxUint32) 2930 fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) { 2931 total += psi.total 2932 // Keep track of start index for this subject. 2933 if psi.fblk < start { 2934 start = psi.fblk 2935 } 2936 }) 2937 // See if we were asked for all, if so we are done. 2938 if sseq <= fs.state.FirstSeq { 2939 return total, validThrough 2940 } 2941 2942 // If we are here we need to calculate partials for the first blocks. 2943 firstSubjBlk := fs.bim[start] 2944 var firstSubjBlkFound bool 2945 // Adjust in case not found. 2946 if firstSubjBlk == nil { 2947 firstSubjBlkFound = true 2948 } 2949 2950 // Track how many we need to adjust against the total. 2951 var adjust uint64 2952 for i := 0; i <= seqStart; i++ { 2953 mb := fs.blks[i] 2954 // We can skip blks if we know they are below the first one that has any subject matches. 2955 if !firstSubjBlkFound { 2956 if firstSubjBlkFound = (mb == firstSubjBlk); !firstSubjBlkFound { 2957 continue 2958 } 2959 } 2960 // We need to scan this block. 2961 var shouldExpire bool 2962 mb.mu.Lock() 2963 // Check if we should include all of this block in adjusting. If so work with metadata. 2964 if sseq > atomic.LoadUint64(&mb.last.seq) { 2965 if isAll { 2966 adjust += mb.msgs 2967 } else { 2968 // We need to adjust for all matches in this block. 2969 // Make sure we have fss loaded. This loads whole block now. 2970 if mb.fssNotLoaded() { 2971 mb.loadMsgsWithLock() 2972 shouldExpire = true 2973 } 2974 for subj, ss := range mb.fss { 2975 if isMatch(subj) { 2976 adjust += ss.Msgs 2977 } 2978 } 2979 } 2980 } else { 2981 // This is the last block. We need to scan per message here. 2982 if mb.cacheNotLoaded() { 2983 mb.loadMsgsWithLock() 2984 shouldExpire = true 2985 } 2986 var last = atomic.LoadUint64(&mb.last.seq) 2987 if sseq < last { 2988 last = sseq 2989 } 2990 // We need to walk all messages in this block 2991 var smv StoreMsg 2992 for seq := atomic.LoadUint64(&mb.first.seq); seq < last; seq++ { 2993 sm, _ := mb.cacheLookup(seq, &smv) 2994 if sm == nil || sm.subj == _EMPTY_ { 2995 continue 2996 } 2997 // Check if it matches our filter. 2998 if sm.seq < sseq && isMatch(sm.subj) { 2999 adjust++ 3000 } 3001 } 3002 } 3003 // If we loaded the block try to force expire. 3004 if shouldExpire { 3005 mb.tryForceExpireCacheLocked() 3006 } 3007 mb.mu.Unlock() 3008 } 3009 // Make final adjustment. 3010 total -= adjust 3011 3012 return total, validThrough 3013 } 3014 3015 // SubjectsTotal return message totals per subject. 3016 func (fs *fileStore) SubjectsTotals(filter string) map[string]uint64 { 3017 fs.mu.RLock() 3018 defer fs.mu.RUnlock() 3019 3020 if fs.psim.Size() == 0 { 3021 return nil 3022 } 3023 // Match all if no filter given. 3024 if filter == _EMPTY_ { 3025 filter = fwcs 3026 } 3027 fst := make(map[string]uint64) 3028 fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) { 3029 fst[string(subj)] = psi.total 3030 }) 3031 return fst 3032 } 3033 3034 // RegisterStorageUpdates registers a callback for updates to storage changes. 3035 // It will present number of messages and bytes as a signed integer and an 3036 // optional sequence number of the message if a single. 3037 func (fs *fileStore) RegisterStorageUpdates(cb StorageUpdateHandler) { 3038 fs.mu.Lock() 3039 fs.scb = cb 3040 bsz := fs.state.Bytes 3041 fs.mu.Unlock() 3042 if cb != nil && bsz > 0 { 3043 cb(0, int64(bsz), 0, _EMPTY_) 3044 } 3045 } 3046 3047 // Helper to get hash key for specific message block. 3048 // Lock should be held 3049 func (fs *fileStore) hashKeyForBlock(index uint32) []byte { 3050 return []byte(fmt.Sprintf("%s-%d", fs.cfg.Name, index)) 3051 } 3052 3053 func (mb *msgBlock) setupWriteCache(buf []byte) { 3054 // Make sure we have a cache setup. 3055 if mb.cache != nil { 3056 return 3057 } 3058 3059 // Setup simple cache. 3060 mb.cache = &cache{buf: buf} 3061 // Make sure we set the proper cache offset if we have existing data. 3062 var fi os.FileInfo 3063 if mb.mfd != nil { 3064 fi, _ = mb.mfd.Stat() 3065 } else if mb.mfn != _EMPTY_ { 3066 fi, _ = os.Stat(mb.mfn) 3067 } 3068 if fi != nil { 3069 mb.cache.off = int(fi.Size()) 3070 } 3071 mb.llts = time.Now().UnixNano() 3072 mb.startCacheExpireTimer() 3073 } 3074 3075 // This rolls to a new append msg block. 3076 // Lock should be held. 3077 func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { 3078 index := uint32(1) 3079 var rbuf []byte 3080 3081 if lmb := fs.lmb; lmb != nil { 3082 index = lmb.index + 1 3083 // Determine if we can reclaim any resources here. 3084 if fs.fip { 3085 lmb.mu.Lock() 3086 lmb.closeFDsLocked() 3087 if lmb.cache != nil { 3088 // Reset write timestamp and see if we can expire this cache. 3089 rbuf = lmb.tryExpireWriteCache() 3090 } 3091 lmb.mu.Unlock() 3092 } 3093 } 3094 3095 mb := fs.initMsgBlock(index) 3096 // Lock should be held to quiet race detector. 3097 mb.mu.Lock() 3098 mb.setupWriteCache(rbuf) 3099 mb.fss = make(map[string]*SimpleState) 3100 3101 // Set cache time to creation time to start. 3102 ts := time.Now().UnixNano() 3103 mb.llts, mb.lwts = 0, ts 3104 // Remember our last sequence number. 3105 atomic.StoreUint64(&mb.first.seq, fs.state.LastSeq+1) 3106 atomic.StoreUint64(&mb.last.seq, fs.state.LastSeq) 3107 mb.mu.Unlock() 3108 3109 // Now do local hash. 3110 key := sha256.Sum256(fs.hashKeyForBlock(index)) 3111 hh, err := highwayhash.New64(key[:]) 3112 if err != nil { 3113 return nil, fmt.Errorf("could not create hash: %v", err) 3114 } 3115 mb.hh = hh 3116 3117 <-dios 3118 mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms) 3119 dios <- struct{}{} 3120 3121 if err != nil { 3122 mb.dirtyCloseWithRemove(true) 3123 return nil, fmt.Errorf("Error creating msg block file: %v", err) 3124 } 3125 mb.mfd = mfd 3126 3127 // Check if encryption is enabled. 3128 if fs.prf != nil { 3129 if err := fs.genEncryptionKeysForBlock(mb); err != nil { 3130 return nil, err 3131 } 3132 } 3133 3134 // If we know we will need this so go ahead and spin up. 3135 if !fs.fip { 3136 mb.spinUpFlushLoop() 3137 } 3138 3139 // Add to our list of blocks and mark as last. 3140 fs.addMsgBlock(mb) 3141 3142 if fs.dirty > 0 { 3143 fs.kickFlushStateLoop() 3144 } 3145 3146 return mb, nil 3147 } 3148 3149 // Generate the keys for this message block and write them out. 3150 func (fs *fileStore) genEncryptionKeysForBlock(mb *msgBlock) error { 3151 if mb == nil { 3152 return nil 3153 } 3154 key, bek, seed, encrypted, err := fs.genEncryptionKeys(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)) 3155 if err != nil { 3156 return err 3157 } 3158 mb.aek, mb.bek, mb.seed, mb.nonce = key, bek, seed, encrypted[:key.NonceSize()] 3159 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 3160 keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)) 3161 if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) { 3162 return err 3163 } 3164 <-dios 3165 err = os.WriteFile(keyFile, encrypted, defaultFilePerms) 3166 dios <- struct{}{} 3167 if err != nil { 3168 return err 3169 } 3170 mb.kfn = keyFile 3171 return nil 3172 } 3173 3174 // Stores a raw message with expected sequence number and timestamp. 3175 // Lock should be held. 3176 func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) (err error) { 3177 if fs.closed { 3178 return ErrStoreClosed 3179 } 3180 3181 // Per subject max check needed. 3182 mmp := uint64(fs.cfg.MaxMsgsPer) 3183 var psmc uint64 3184 psmax := mmp > 0 && len(subj) > 0 3185 if psmax { 3186 if info, ok := fs.psim.Find(stringToBytes(subj)); ok { 3187 psmc = info.total 3188 } 3189 } 3190 3191 var fseq uint64 3192 // Check if we are discarding new messages when we reach the limit. 3193 if fs.cfg.Discard == DiscardNew { 3194 var asl bool 3195 if psmax && psmc >= mmp { 3196 // If we are instructed to discard new per subject, this is an error. 3197 if fs.cfg.DiscardNewPer { 3198 return ErrMaxMsgsPerSubject 3199 } 3200 if fseq, err = fs.firstSeqForSubj(subj); err != nil { 3201 return err 3202 } 3203 asl = true 3204 } 3205 if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl { 3206 return ErrMaxMsgs 3207 } 3208 if fs.cfg.MaxBytes > 0 && fs.state.Bytes+fileStoreMsgSize(subj, hdr, msg) >= uint64(fs.cfg.MaxBytes) { 3209 if !asl || fs.sizeForSeq(fseq) <= int(fileStoreMsgSize(subj, hdr, msg)) { 3210 return ErrMaxBytes 3211 } 3212 } 3213 } 3214 3215 // Check sequence. 3216 if seq != fs.state.LastSeq+1 { 3217 if seq > 0 { 3218 return ErrSequenceMismatch 3219 } 3220 seq = fs.state.LastSeq + 1 3221 } 3222 3223 // Write msg record. 3224 n, err := fs.writeMsgRecord(seq, ts, subj, hdr, msg) 3225 if err != nil { 3226 return err 3227 } 3228 3229 // Adjust top level tracking of per subject msg counts. 3230 if len(subj) > 0 && fs.psim != nil { 3231 index := fs.lmb.index 3232 if info, ok := fs.psim.Find(stringToBytes(subj)); ok { 3233 info.total++ 3234 if index > info.lblk { 3235 info.lblk = index 3236 } 3237 } else { 3238 fs.psim.Insert(stringToBytes(subj), psi{total: 1, fblk: index, lblk: index}) 3239 fs.tsl += len(subj) 3240 } 3241 } 3242 3243 // Adjust first if needed. 3244 now := time.Unix(0, ts).UTC() 3245 if fs.state.Msgs == 0 { 3246 fs.state.FirstSeq = seq 3247 fs.state.FirstTime = now 3248 } 3249 3250 fs.state.Msgs++ 3251 fs.state.Bytes += n 3252 fs.state.LastSeq = seq 3253 fs.state.LastTime = now 3254 3255 // Enforce per message limits. 3256 // We snapshotted psmc before our actual write, so >= comparison needed. 3257 if psmax && psmc >= mmp { 3258 // We may have done this above. 3259 if fseq == 0 { 3260 fseq, _ = fs.firstSeqForSubj(subj) 3261 } 3262 if ok, _ := fs.removeMsgViaLimits(fseq); ok { 3263 // Make sure we are below the limit. 3264 if psmc--; psmc >= mmp { 3265 bsubj := stringToBytes(subj) 3266 for info, ok := fs.psim.Find(bsubj); ok && info.total > mmp; info, ok = fs.psim.Find(bsubj) { 3267 if seq, _ := fs.firstSeqForSubj(subj); seq > 0 { 3268 if ok, _ := fs.removeMsgViaLimits(seq); !ok { 3269 break 3270 } 3271 } else { 3272 break 3273 } 3274 } 3275 } 3276 } else if mb := fs.selectMsgBlock(fseq); mb != nil { 3277 // If we are here we could not remove fseq from above, so rebuild. 3278 var ld *LostStreamData 3279 if ld, _, _ = mb.rebuildState(); ld != nil { 3280 fs.rebuildStateLocked(ld) 3281 } 3282 } 3283 } 3284 3285 // Limits checks and enforcement. 3286 // If they do any deletions they will update the 3287 // byte count on their own, so no need to compensate. 3288 fs.enforceMsgLimit() 3289 fs.enforceBytesLimit() 3290 3291 // Check if we have and need the age expiration timer running. 3292 if fs.ageChk == nil && fs.cfg.MaxAge != 0 { 3293 fs.startAgeChk() 3294 } 3295 3296 return nil 3297 } 3298 3299 // StoreRawMsg stores a raw message with expected sequence number and timestamp. 3300 func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) error { 3301 fs.mu.Lock() 3302 err := fs.storeRawMsg(subj, hdr, msg, seq, ts) 3303 cb := fs.scb 3304 // Check if first message timestamp requires expiry 3305 // sooner than initial replica expiry timer set to MaxAge when initializing. 3306 if !fs.receivedAny && fs.cfg.MaxAge != 0 && ts > 0 { 3307 fs.receivedAny = true 3308 // don't block here by calling expireMsgs directly. 3309 // Instead, set short timeout. 3310 fs.resetAgeChk(int64(time.Millisecond * 50)) 3311 } 3312 fs.mu.Unlock() 3313 3314 if err == nil && cb != nil { 3315 cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj) 3316 } 3317 3318 return err 3319 } 3320 3321 // Store stores a message. We hold the main filestore lock for any write operation. 3322 func (fs *fileStore) StoreMsg(subj string, hdr, msg []byte) (uint64, int64, error) { 3323 fs.mu.Lock() 3324 seq, ts := fs.state.LastSeq+1, time.Now().UnixNano() 3325 err := fs.storeRawMsg(subj, hdr, msg, seq, ts) 3326 cb := fs.scb 3327 fs.mu.Unlock() 3328 3329 if err != nil { 3330 seq, ts = 0, 0 3331 } else if cb != nil { 3332 cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj) 3333 } 3334 3335 return seq, ts, err 3336 } 3337 3338 // skipMsg will update this message block for a skipped message. 3339 // If we do not have any messages, just update the metadata, otherwise 3340 // we will place an empty record marking the sequence as used. The 3341 // sequence will be marked erased. 3342 // fs lock should be held. 3343 func (mb *msgBlock) skipMsg(seq uint64, now time.Time) { 3344 if mb == nil { 3345 return 3346 } 3347 var needsRecord bool 3348 3349 nowts := now.UnixNano() 3350 3351 mb.mu.Lock() 3352 // If we are empty can just do meta. 3353 if mb.msgs == 0 { 3354 atomic.StoreUint64(&mb.last.seq, seq) 3355 mb.last.ts = nowts 3356 atomic.StoreUint64(&mb.first.seq, seq+1) 3357 mb.first.ts = nowts 3358 } else { 3359 needsRecord = true 3360 mb.dmap.Insert(seq) 3361 } 3362 mb.mu.Unlock() 3363 3364 if needsRecord { 3365 mb.writeMsgRecord(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, nowts, true) 3366 } else { 3367 mb.kickFlusher() 3368 } 3369 } 3370 3371 // SkipMsg will use the next sequence number but not store anything. 3372 func (fs *fileStore) SkipMsg() uint64 { 3373 fs.mu.Lock() 3374 defer fs.mu.Unlock() 3375 3376 // Grab our current last message block. 3377 mb := fs.lmb 3378 if mb == nil || mb.msgs > 0 && mb.blkSize()+emptyRecordLen > fs.fcfg.BlockSize { 3379 if mb != nil && fs.fcfg.Compression != NoCompression { 3380 // We've now reached the end of this message block, if we want 3381 // to compress blocks then now's the time to do it. 3382 go mb.recompressOnDiskIfNeeded() 3383 } 3384 var err error 3385 if mb, err = fs.newMsgBlockForWrite(); err != nil { 3386 return 0 3387 } 3388 } 3389 3390 // Grab time and last seq. 3391 now, seq := time.Now().UTC(), fs.state.LastSeq+1 3392 3393 // Write skip msg. 3394 mb.skipMsg(seq, now) 3395 3396 // Update fs state. 3397 fs.state.LastSeq, fs.state.LastTime = seq, now 3398 if fs.state.Msgs == 0 { 3399 fs.state.FirstSeq, fs.state.FirstTime = seq, now 3400 } 3401 if seq == fs.state.FirstSeq { 3402 fs.state.FirstSeq, fs.state.FirstTime = seq+1, now 3403 } 3404 // Mark as dirty for stream state. 3405 fs.dirty++ 3406 3407 return seq 3408 } 3409 3410 // Skip multiple msgs. We will determine if we can fit into current lmb or we need to create a new block. 3411 func (fs *fileStore) SkipMsgs(seq uint64, num uint64) error { 3412 fs.mu.Lock() 3413 defer fs.mu.Unlock() 3414 3415 // Check sequence matches our last sequence. 3416 if seq != fs.state.LastSeq+1 { 3417 if seq > 0 { 3418 return ErrSequenceMismatch 3419 } 3420 seq = fs.state.LastSeq + 1 3421 } 3422 3423 // Limit number of dmap entries 3424 const maxDeletes = 64 * 1024 3425 mb := fs.lmb 3426 3427 numDeletes := int(num) 3428 if mb != nil { 3429 numDeletes += mb.dmap.Size() 3430 } 3431 if mb == nil || numDeletes > maxDeletes && mb.msgs > 0 || mb.msgs > 0 && mb.blkSize()+emptyRecordLen > fs.fcfg.BlockSize { 3432 if mb != nil && fs.fcfg.Compression != NoCompression { 3433 // We've now reached the end of this message block, if we want 3434 // to compress blocks then now's the time to do it. 3435 go mb.recompressOnDiskIfNeeded() 3436 } 3437 var err error 3438 if mb, err = fs.newMsgBlockForWrite(); err != nil { 3439 return err 3440 } 3441 } 3442 3443 // Insert into dmap all entries and place last as marker. 3444 now := time.Now().UTC() 3445 nowts := now.UnixNano() 3446 lseq := seq + num - 1 3447 3448 mb.mu.Lock() 3449 var needsRecord bool 3450 // If we are empty update meta directly. 3451 if mb.msgs == 0 { 3452 atomic.StoreUint64(&mb.last.seq, lseq) 3453 mb.last.ts = nowts 3454 atomic.StoreUint64(&mb.first.seq, lseq+1) 3455 mb.first.ts = nowts 3456 } else { 3457 needsRecord = true 3458 for ; seq <= lseq; seq++ { 3459 mb.dmap.Insert(seq) 3460 } 3461 } 3462 mb.mu.Unlock() 3463 3464 // Write out our placeholder. 3465 if needsRecord { 3466 mb.writeMsgRecord(emptyRecordLen, lseq|ebit, _EMPTY_, nil, nil, nowts, true) 3467 } 3468 3469 // Now update FS accounting. 3470 // Update fs state. 3471 fs.state.LastSeq, fs.state.LastTime = lseq, now 3472 if fs.state.Msgs == 0 { 3473 fs.state.FirstSeq, fs.state.FirstTime = lseq+1, now 3474 } 3475 3476 // Mark as dirty for stream state. 3477 fs.dirty++ 3478 3479 return nil 3480 } 3481 3482 // Lock should be held. 3483 func (fs *fileStore) rebuildFirst() { 3484 if len(fs.blks) == 0 { 3485 return 3486 } 3487 fmb := fs.blks[0] 3488 if fmb == nil { 3489 return 3490 } 3491 3492 ld, _, _ := fmb.rebuildState() 3493 fmb.mu.RLock() 3494 isEmpty := fmb.msgs == 0 3495 fmb.mu.RUnlock() 3496 if isEmpty { 3497 fmb.mu.Lock() 3498 fs.removeMsgBlock(fmb) 3499 fmb.mu.Unlock() 3500 } 3501 fs.selectNextFirst() 3502 fs.rebuildStateLocked(ld) 3503 } 3504 3505 // Optimized helper function to return first sequence. 3506 // subj will always be publish subject here, meaning non-wildcard. 3507 // We assume a fast check that this subj even exists already happened. 3508 // Lock should be held. 3509 func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) { 3510 if len(fs.blks) == 0 { 3511 return 0, nil 3512 } 3513 3514 // See if we can optimize where we start. 3515 start, stop := fs.blks[0].index, fs.lmb.index 3516 if info, ok := fs.psim.Find(stringToBytes(subj)); ok { 3517 start, stop = info.fblk, info.lblk 3518 } 3519 3520 for i := start; i <= stop; i++ { 3521 mb := fs.bim[i] 3522 if mb == nil { 3523 continue 3524 } 3525 mb.mu.Lock() 3526 var shouldExpire bool 3527 if mb.fssNotLoaded() { 3528 // Make sure we have fss loaded. 3529 if err := mb.loadMsgsWithLock(); err != nil { 3530 mb.mu.Unlock() 3531 return 0, err 3532 } 3533 shouldExpire = true 3534 } 3535 if ss := mb.fss[subj]; ss != nil { 3536 // Adjust first if it was not where we thought it should be. 3537 if i != start { 3538 if info, ok := fs.psim.Find(stringToBytes(subj)); ok { 3539 info.fblk = i 3540 } 3541 } 3542 if ss.firstNeedsUpdate { 3543 mb.recalculateFirstForSubj(subj, ss.First, ss) 3544 } 3545 mb.mu.Unlock() 3546 return ss.First, nil 3547 } 3548 // If we did not find it and we loaded this msgBlock try to expire as long as not the last. 3549 if shouldExpire { 3550 // Expire this cache before moving on. 3551 mb.tryForceExpireCacheLocked() 3552 } 3553 mb.mu.Unlock() 3554 } 3555 return 0, nil 3556 } 3557 3558 // Will check the msg limit and drop firstSeq msg if needed. 3559 // Lock should be held. 3560 func (fs *fileStore) enforceMsgLimit() { 3561 if fs.cfg.Discard != DiscardOld { 3562 return 3563 } 3564 if fs.cfg.MaxMsgs <= 0 || fs.state.Msgs <= uint64(fs.cfg.MaxMsgs) { 3565 return 3566 } 3567 for nmsgs := fs.state.Msgs; nmsgs > uint64(fs.cfg.MaxMsgs); nmsgs = fs.state.Msgs { 3568 if removed, err := fs.deleteFirstMsg(); err != nil || !removed { 3569 fs.rebuildFirst() 3570 return 3571 } 3572 } 3573 } 3574 3575 // Will check the bytes limit and drop msgs if needed. 3576 // Lock should be held. 3577 func (fs *fileStore) enforceBytesLimit() { 3578 if fs.cfg.Discard != DiscardOld { 3579 return 3580 } 3581 if fs.cfg.MaxBytes <= 0 || fs.state.Bytes <= uint64(fs.cfg.MaxBytes) { 3582 return 3583 } 3584 for bs := fs.state.Bytes; bs > uint64(fs.cfg.MaxBytes); bs = fs.state.Bytes { 3585 if removed, err := fs.deleteFirstMsg(); err != nil || !removed { 3586 fs.rebuildFirst() 3587 return 3588 } 3589 } 3590 } 3591 3592 // Will make sure we have limits honored for max msgs per subject on recovery or config update. 3593 // We will make sure to go through all msg blocks etc. but in practice this 3594 // will most likely only be the last one, so can take a more conservative approach. 3595 // Lock should be held. 3596 func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) { 3597 maxMsgsPer := uint64(fs.cfg.MaxMsgsPer) 3598 3599 // We may want to suppress callbacks from remove during this process 3600 // since these should have already been deleted and accounted for. 3601 if !fireCallback { 3602 cb := fs.scb 3603 fs.scb = nil 3604 defer func() { fs.scb = cb }() 3605 } 3606 3607 var numMsgs uint64 3608 3609 // collect all that are not correct. 3610 needAttention := make(map[string]*psi) 3611 fs.psim.Iter(func(subj []byte, psi *psi) bool { 3612 numMsgs += psi.total 3613 if psi.total > maxMsgsPer { 3614 needAttention[string(subj)] = psi 3615 } 3616 return true 3617 }) 3618 3619 // We had an issue with a use case where psim (and hence fss) were correct but idx was not and was not properly being caught. 3620 // So do a quick sanity check here. If we detect a skew do a rebuild then re-check. 3621 if numMsgs != fs.state.Msgs { 3622 fs.warn("Detected skew in subject-based total (%d) vs raw total (%d), rebuilding", numMsgs, fs.state.Msgs) 3623 // Clear any global subject state. 3624 fs.psim, fs.tsl = fs.psim.Empty(), 0 3625 for _, mb := range fs.blks { 3626 ld, _, err := mb.rebuildState() 3627 if err != nil && ld != nil { 3628 fs.addLostData(ld) 3629 } 3630 fs.populateGlobalPerSubjectInfo(mb) 3631 } 3632 // Rebuild fs state too. 3633 fs.rebuildStateLocked(nil) 3634 // Need to redo blocks that need attention. 3635 needAttention = make(map[string]*psi) 3636 fs.psim.Iter(func(subj []byte, psi *psi) bool { 3637 if psi.total > maxMsgsPer { 3638 needAttention[string(subj)] = psi 3639 } 3640 return true 3641 }) 3642 } 3643 3644 // Collect all the msgBlks we alter. 3645 blks := make(map[*msgBlock]struct{}) 3646 3647 // For re-use below. 3648 var sm StoreMsg 3649 3650 // Walk all subjects that need attention here. 3651 for subj, info := range needAttention { 3652 total, start, stop := info.total, info.fblk, info.lblk 3653 3654 for i := start; i <= stop; i++ { 3655 mb := fs.bim[i] 3656 if mb == nil { 3657 continue 3658 } 3659 // Grab the ss entry for this subject in case sparse. 3660 mb.mu.Lock() 3661 mb.ensurePerSubjectInfoLoaded() 3662 ss := mb.fss[subj] 3663 if ss != nil && ss.firstNeedsUpdate { 3664 mb.recalculateFirstForSubj(subj, ss.First, ss) 3665 } 3666 mb.mu.Unlock() 3667 if ss == nil { 3668 continue 3669 } 3670 for seq := ss.First; seq <= ss.Last && total > maxMsgsPer; { 3671 m, _, err := mb.firstMatching(subj, false, seq, &sm) 3672 if err == nil { 3673 seq = m.seq + 1 3674 if removed, _ := fs.removeMsgViaLimits(m.seq); removed { 3675 total-- 3676 blks[mb] = struct{}{} 3677 } 3678 } else { 3679 // On error just do single increment. 3680 seq++ 3681 } 3682 } 3683 } 3684 } 3685 3686 // Expire the cache if we can. 3687 for mb := range blks { 3688 mb.mu.Lock() 3689 if mb.msgs > 0 { 3690 mb.tryForceExpireCacheLocked() 3691 } 3692 mb.mu.Unlock() 3693 } 3694 } 3695 3696 // Lock should be held. 3697 func (fs *fileStore) deleteFirstMsg() (bool, error) { 3698 return fs.removeMsgViaLimits(fs.state.FirstSeq) 3699 } 3700 3701 // If we remove via limits that can always be recovered on a restart we 3702 // do not force the system to update the index file. 3703 // Lock should be held. 3704 func (fs *fileStore) removeMsgViaLimits(seq uint64) (bool, error) { 3705 return fs.removeMsg(seq, false, true, false) 3706 } 3707 3708 // RemoveMsg will remove the message from this store. 3709 // Will return the number of bytes removed. 3710 func (fs *fileStore) RemoveMsg(seq uint64) (bool, error) { 3711 return fs.removeMsg(seq, false, false, true) 3712 } 3713 3714 func (fs *fileStore) EraseMsg(seq uint64) (bool, error) { 3715 return fs.removeMsg(seq, true, false, true) 3716 } 3717 3718 // Convenience function to remove per subject tracking at the filestore level. 3719 // Lock should be held. 3720 func (fs *fileStore) removePerSubject(subj string) { 3721 if len(subj) == 0 || fs.psim == nil { 3722 return 3723 } 3724 // We do not update sense of fblk here but will do so when we resolve during lookup. 3725 bsubj := stringToBytes(subj) 3726 if info, ok := fs.psim.Find(bsubj); ok { 3727 info.total-- 3728 if info.total == 1 { 3729 info.fblk = info.lblk 3730 } else if info.total == 0 { 3731 if _, ok = fs.psim.Delete(bsubj); ok { 3732 fs.tsl -= len(subj) 3733 } 3734 } 3735 } 3736 } 3737 3738 // Remove a message, optionally rewriting the mb file. 3739 func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (bool, error) { 3740 if seq == 0 { 3741 return false, ErrStoreMsgNotFound 3742 } 3743 fsLock := func() { 3744 if needFSLock { 3745 fs.mu.Lock() 3746 } 3747 } 3748 fsUnlock := func() { 3749 if needFSLock { 3750 fs.mu.Unlock() 3751 } 3752 } 3753 3754 fsLock() 3755 3756 if fs.closed { 3757 fsUnlock() 3758 return false, ErrStoreClosed 3759 } 3760 if !viaLimits && fs.sips > 0 { 3761 fsUnlock() 3762 return false, ErrStoreSnapshotInProgress 3763 } 3764 // If in encrypted mode negate secure rewrite here. 3765 if secure && fs.prf != nil { 3766 secure = false 3767 } 3768 3769 if fs.state.Msgs == 0 { 3770 var err = ErrStoreEOF 3771 if seq <= fs.state.LastSeq { 3772 err = ErrStoreMsgNotFound 3773 } 3774 fsUnlock() 3775 return false, err 3776 } 3777 3778 mb := fs.selectMsgBlock(seq) 3779 if mb == nil { 3780 var err = ErrStoreEOF 3781 if seq <= fs.state.LastSeq { 3782 err = ErrStoreMsgNotFound 3783 } 3784 fsUnlock() 3785 return false, err 3786 } 3787 3788 mb.mu.Lock() 3789 3790 // See if we are closed or the sequence number is still relevant. 3791 if mb.closed || seq < atomic.LoadUint64(&mb.first.seq) { 3792 mb.mu.Unlock() 3793 fsUnlock() 3794 return false, nil 3795 } 3796 3797 // Now check dmap if it is there. 3798 if mb.dmap.Exists(seq) { 3799 mb.mu.Unlock() 3800 fsUnlock() 3801 return false, nil 3802 } 3803 3804 // We used to not have to load in the messages except with callbacks or the filtered subject state (which is now always on). 3805 // Now just load regardless. 3806 // TODO(dlc) - Figure out a way not to have to load it in, we need subject tracking outside main data block. 3807 if mb.cacheNotLoaded() { 3808 // We do not want to block possible activity within another msg block. 3809 // We have to unlock both locks and acquire the mb lock in the loadMsgs() call to avoid a deadlock if another 3810 // go routine was trying to get fs then this mb lock at the same time. E.g. another call to remove for same block. 3811 mb.mu.Unlock() 3812 fsUnlock() 3813 if err := mb.loadMsgs(); err != nil { 3814 return false, err 3815 } 3816 fsLock() 3817 // We need to check if things changed out from underneath us. 3818 if fs.closed { 3819 fsUnlock() 3820 return false, ErrStoreClosed 3821 } 3822 mb.mu.Lock() 3823 if mb.closed || seq < atomic.LoadUint64(&mb.first.seq) { 3824 mb.mu.Unlock() 3825 fsUnlock() 3826 return false, nil 3827 } 3828 // cacheLookup below will do dmap check so no need to repeat here. 3829 } 3830 3831 var smv StoreMsg 3832 sm, err := mb.cacheLookup(seq, &smv) 3833 if err != nil { 3834 mb.mu.Unlock() 3835 fsUnlock() 3836 // Mimic err behavior from above check to dmap. No error returned if already removed. 3837 if err == errDeletedMsg { 3838 err = nil 3839 } 3840 return false, err 3841 } 3842 // Grab size 3843 msz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) 3844 3845 // Set cache timestamp for last remove. 3846 mb.lrts = time.Now().UnixNano() 3847 3848 // Global stats 3849 if fs.state.Msgs > 0 { 3850 fs.state.Msgs-- 3851 } 3852 if msz < fs.state.Bytes { 3853 fs.state.Bytes -= msz 3854 } else { 3855 fs.state.Bytes = 0 3856 } 3857 3858 // Now local mb updates. 3859 if mb.msgs > 0 { 3860 mb.msgs-- 3861 } 3862 if msz < mb.bytes { 3863 mb.bytes -= msz 3864 } else { 3865 mb.bytes = 0 3866 } 3867 3868 // Mark as dirty for stream state. 3869 fs.dirty++ 3870 3871 // If we are tracking subjects here make sure we update that accounting. 3872 mb.ensurePerSubjectInfoLoaded() 3873 3874 // If we are tracking multiple subjects here make sure we update that accounting. 3875 mb.removeSeqPerSubject(sm.subj, seq) 3876 fs.removePerSubject(sm.subj) 3877 3878 if secure { 3879 // Grab record info. 3880 ri, rl, _, _ := mb.slotInfo(int(seq - mb.cache.fseq)) 3881 mb.eraseMsg(seq, int(ri), int(rl)) 3882 } 3883 3884 fifo := seq == atomic.LoadUint64(&mb.first.seq) 3885 isLastBlock := mb == fs.lmb 3886 isEmpty := mb.msgs == 0 3887 3888 if fifo { 3889 mb.selectNextFirst() 3890 if !isEmpty { 3891 // Can update this one in place. 3892 if seq == fs.state.FirstSeq { 3893 fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) // new one. 3894 if mb.first.ts == 0 { 3895 fs.state.FirstTime = time.Time{} 3896 } else { 3897 fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() 3898 } 3899 } 3900 } 3901 } else if !isEmpty { 3902 // Out of order delete. 3903 mb.dmap.Insert(seq) 3904 // Make simple check here similar to Compact(). If we can save 50% and over a certain threshold do inline. 3905 // All other more thorough cleanup will happen in syncBlocks logic. 3906 // Note that we do not have to store empty records for the deleted, so don't use to calculate. 3907 // TODO(dlc) - This should not be inline, should kick the sync routine. 3908 if mb.rbytes > compactMinimum && mb.bytes*2 < mb.rbytes && !isLastBlock { 3909 mb.compact() 3910 fs.kickFlushStateLoop() 3911 } 3912 } 3913 3914 if secure { 3915 if ld, _ := mb.flushPendingMsgsLocked(); ld != nil { 3916 // We have the mb lock here, this needs the mb locks so do in its own go routine. 3917 go fs.rebuildState(ld) 3918 } 3919 } 3920 3921 // If empty remove this block and check if we need to update first sequence. 3922 // We will write a tombstone at the end. 3923 var firstSeqNeedsUpdate bool 3924 if isEmpty { 3925 // This writes tombstone iff mb == lmb, so no need to do below. 3926 fs.removeMsgBlock(mb) 3927 firstSeqNeedsUpdate = seq == fs.state.FirstSeq 3928 } 3929 mb.mu.Unlock() 3930 3931 // If we emptied the current message block and the seq was state.FirstSeq 3932 // then we need to jump message blocks. We will also write the index so 3933 // we don't lose track of the first sequence. 3934 if firstSeqNeedsUpdate { 3935 fs.selectNextFirst() 3936 } 3937 3938 // Check if we need to write a deleted record tombstone. 3939 // This is for user initiated removes or to hold the first seq 3940 // when the last block is empty. 3941 3942 // If not via limits and not empty and last (empty writes tombstone above if last) write tombstone. 3943 if !viaLimits && !(isEmpty && isLastBlock) { 3944 if lmb := fs.lmb; sm != nil && lmb != nil { 3945 lmb.writeTombstone(sm.seq, sm.ts) 3946 } 3947 } 3948 3949 if cb := fs.scb; cb != nil { 3950 // If we have a callback registered we need to release lock regardless since cb might need it to lookup msg, etc. 3951 fs.mu.Unlock() 3952 // Storage updates. 3953 var subj string 3954 if sm != nil { 3955 subj = sm.subj 3956 } 3957 delta := int64(msz) 3958 cb(-1, -delta, seq, subj) 3959 3960 if !needFSLock { 3961 fs.mu.Lock() 3962 } 3963 } else if needFSLock { 3964 // We acquired it so release it. 3965 fs.mu.Unlock() 3966 } 3967 3968 return true, nil 3969 } 3970 3971 // This will compact and rewrite this block. This should only be called when we know we want to rewrite this block. 3972 // This should not be called on the lmb since we will prune tail deleted messages which could cause issues with 3973 // writing new messages. We will silently bail on any issues with the underlying block and let someone else detect. 3974 // Write lock needs to be held. 3975 func (mb *msgBlock) compact() { 3976 wasLoaded := mb.cacheAlreadyLoaded() 3977 if !wasLoaded { 3978 if err := mb.loadMsgsWithLock(); err != nil { 3979 return 3980 } 3981 } 3982 3983 buf := mb.cache.buf 3984 nbuf := getMsgBlockBuf(len(buf)) 3985 // Recycle our nbuf when we are done. 3986 defer recycleMsgBlockBuf(nbuf) 3987 3988 var le = binary.LittleEndian 3989 var firstSet bool 3990 3991 fseq := atomic.LoadUint64(&mb.first.seq) 3992 isDeleted := func(seq uint64) bool { 3993 return seq == 0 || seq&ebit != 0 || mb.dmap.Exists(seq) || seq < fseq 3994 } 3995 3996 for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; { 3997 if index+msgHdrSize > lbuf { 3998 return 3999 } 4000 hdr := buf[index : index+msgHdrSize] 4001 rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:]) 4002 // Clear any headers bit that could be set. 4003 rl &^= hbit 4004 dlen := int(rl) - msgHdrSize 4005 // Do some quick sanity checks here. 4006 if dlen < 0 || int(slen) > dlen || dlen > int(rl) || rl > rlBadThresh || index+rl > lbuf { 4007 return 4008 } 4009 // Only need to process non-deleted messages. 4010 seq := le.Uint64(hdr[4:]) 4011 4012 if !isDeleted(seq) { 4013 // Check for tombstones. 4014 if seq&tbit != 0 { 4015 // If we are last mb we should consider to keep these unless the tombstone reflects a seq in this mb. 4016 if mb == mb.fs.lmb && seq < fseq { 4017 nbuf = append(nbuf, buf[index:index+rl]...) 4018 } 4019 } else { 4020 // Normal message here. 4021 nbuf = append(nbuf, buf[index:index+rl]...) 4022 if !firstSet { 4023 firstSet = true 4024 atomic.StoreUint64(&mb.first.seq, seq) 4025 } 4026 } 4027 } 4028 // Advance to next record. 4029 index += rl 4030 } 4031 4032 // Handle compression 4033 if mb.cmp != NoCompression { 4034 cbuf, err := mb.cmp.Compress(nbuf) 4035 if err != nil { 4036 return 4037 } 4038 meta := &CompressionInfo{ 4039 Algorithm: mb.cmp, 4040 OriginalSize: uint64(len(nbuf)), 4041 } 4042 nbuf = append(meta.MarshalMetadata(), cbuf...) 4043 } 4044 4045 // Check for encryption. 4046 if mb.bek != nil && len(nbuf) > 0 { 4047 // Recreate to reset counter. 4048 rbek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 4049 if err != nil { 4050 return 4051 } 4052 rbek.XORKeyStream(nbuf, nbuf) 4053 } 4054 4055 // Close FDs first. 4056 mb.closeFDsLocked() 4057 4058 // We will write to a new file and mv/rename it in case of failure. 4059 mfn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(newScan, mb.index)) 4060 <-dios 4061 err := os.WriteFile(mfn, nbuf, defaultFilePerms) 4062 dios <- struct{}{} 4063 if err != nil { 4064 os.Remove(mfn) 4065 return 4066 } 4067 if err := os.Rename(mfn, mb.mfn); err != nil { 4068 os.Remove(mfn) 4069 return 4070 } 4071 4072 // Capture the updated rbytes. 4073 mb.rbytes = uint64(len(nbuf)) 4074 4075 // Remove any seqs from the beginning of the blk. 4076 for seq, nfseq := fseq, atomic.LoadUint64(&mb.first.seq); seq < nfseq; seq++ { 4077 mb.dmap.Delete(seq) 4078 } 4079 // Make sure we clear the cache since no longer valid. 4080 mb.clearCacheAndOffset() 4081 // If we entered with the msgs loaded make sure to reload them. 4082 if wasLoaded { 4083 mb.loadMsgsWithLock() 4084 } 4085 } 4086 4087 // Grab info from a slot. 4088 // Lock should be held. 4089 func (mb *msgBlock) slotInfo(slot int) (uint32, uint32, bool, error) { 4090 if mb.cache == nil || slot >= len(mb.cache.idx) { 4091 return 0, 0, false, errPartialCache 4092 } 4093 4094 bi := mb.cache.idx[slot] 4095 ri, hashChecked := (bi &^ hbit), (bi&hbit) != 0 4096 4097 // If this is a deleted slot return here. 4098 if bi == dbit { 4099 return 0, 0, false, errDeletedMsg 4100 } 4101 4102 // Determine record length 4103 var rl uint32 4104 if slot >= len(mb.cache.idx) { 4105 rl = mb.cache.lrl 4106 } else { 4107 // Need to account for dbit markers in idx. 4108 // So we will walk until we find valid idx slot to calculate rl. 4109 for i := 1; slot+i < len(mb.cache.idx); i++ { 4110 ni := mb.cache.idx[slot+i] &^ hbit 4111 if ni == dbit { 4112 continue 4113 } 4114 rl = ni - ri 4115 break 4116 } 4117 // check if we had all trailing dbits. 4118 // If so use len of cache buf minus ri. 4119 if rl == 0 { 4120 rl = uint32(len(mb.cache.buf)) - ri 4121 } 4122 } 4123 if rl < msgHdrSize { 4124 return 0, 0, false, errBadMsg 4125 } 4126 return uint32(ri), rl, hashChecked, nil 4127 } 4128 4129 func (fs *fileStore) isClosed() bool { 4130 fs.mu.RLock() 4131 closed := fs.closed 4132 fs.mu.RUnlock() 4133 return closed 4134 } 4135 4136 // Will spin up our flush loop. 4137 func (mb *msgBlock) spinUpFlushLoop() { 4138 mb.mu.Lock() 4139 defer mb.mu.Unlock() 4140 4141 // Are we already running or closed? 4142 if mb.flusher || mb.closed { 4143 return 4144 } 4145 mb.flusher = true 4146 mb.fch = make(chan struct{}, 1) 4147 mb.qch = make(chan struct{}) 4148 fch, qch := mb.fch, mb.qch 4149 4150 go mb.flushLoop(fch, qch) 4151 } 4152 4153 // Raw low level kicker for flush loops. 4154 func kickFlusher(fch chan struct{}) { 4155 if fch != nil { 4156 select { 4157 case fch <- struct{}{}: 4158 default: 4159 } 4160 } 4161 } 4162 4163 // Kick flusher for this message block. 4164 func (mb *msgBlock) kickFlusher() { 4165 mb.mu.RLock() 4166 defer mb.mu.RUnlock() 4167 kickFlusher(mb.fch) 4168 } 4169 4170 func (mb *msgBlock) setInFlusher() { 4171 mb.mu.Lock() 4172 mb.flusher = true 4173 mb.mu.Unlock() 4174 } 4175 4176 func (mb *msgBlock) clearInFlusher() { 4177 mb.mu.Lock() 4178 mb.flusher = false 4179 mb.mu.Unlock() 4180 } 4181 4182 // flushLoop watches for messages, index info, or recently closed msg block updates. 4183 func (mb *msgBlock) flushLoop(fch, qch chan struct{}) { 4184 mb.setInFlusher() 4185 defer mb.clearInFlusher() 4186 4187 for { 4188 select { 4189 case <-fch: 4190 // If we have pending messages process them first. 4191 if waiting := mb.pendingWriteSize(); waiting != 0 { 4192 ts := 1 * time.Millisecond 4193 var waited time.Duration 4194 4195 for waiting < coalesceMinimum { 4196 time.Sleep(ts) 4197 select { 4198 case <-qch: 4199 return 4200 default: 4201 } 4202 newWaiting := mb.pendingWriteSize() 4203 if waited = waited + ts; waited > maxFlushWait || newWaiting <= waiting { 4204 break 4205 } 4206 waiting = newWaiting 4207 ts *= 2 4208 } 4209 mb.flushPendingMsgs() 4210 // Check if we are no longer the last message block. If we are 4211 // not we can close FDs and exit. 4212 mb.fs.mu.RLock() 4213 notLast := mb != mb.fs.lmb 4214 mb.fs.mu.RUnlock() 4215 if notLast { 4216 if err := mb.closeFDs(); err == nil { 4217 return 4218 } 4219 } 4220 } 4221 case <-qch: 4222 return 4223 } 4224 } 4225 } 4226 4227 // Lock should be held. 4228 func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error { 4229 var le = binary.LittleEndian 4230 var hdr [msgHdrSize]byte 4231 4232 le.PutUint32(hdr[0:], uint32(rl)) 4233 le.PutUint64(hdr[4:], seq|ebit) 4234 le.PutUint64(hdr[12:], 0) 4235 le.PutUint16(hdr[20:], 0) 4236 4237 // Randomize record 4238 data := make([]byte, rl-emptyRecordLen) 4239 rand.Read(data) 4240 4241 // Now write to underlying buffer. 4242 var b bytes.Buffer 4243 b.Write(hdr[:]) 4244 b.Write(data) 4245 4246 // Calculate hash. 4247 mb.hh.Reset() 4248 mb.hh.Write(hdr[4:20]) 4249 mb.hh.Write(data) 4250 checksum := mb.hh.Sum(nil) 4251 // Write to msg record. 4252 b.Write(checksum) 4253 4254 // Update both cache and disk. 4255 nbytes := b.Bytes() 4256 4257 // Cache 4258 if ri >= mb.cache.off { 4259 li := ri - mb.cache.off 4260 buf := mb.cache.buf[li : li+rl] 4261 copy(buf, nbytes) 4262 } 4263 4264 // Disk 4265 if mb.cache.off+mb.cache.wp > ri { 4266 <-dios 4267 mfd, err := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms) 4268 dios <- struct{}{} 4269 if err != nil { 4270 return err 4271 } 4272 defer mfd.Close() 4273 if _, err = mfd.WriteAt(nbytes, int64(ri)); err == nil { 4274 mfd.Sync() 4275 } 4276 if err != nil { 4277 return err 4278 } 4279 } 4280 return nil 4281 } 4282 4283 // Truncate this message block to the storedMsg. 4284 func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) { 4285 // Make sure we are loaded to process messages etc. 4286 if err := mb.loadMsgs(); err != nil { 4287 return 0, 0, err 4288 } 4289 4290 // Calculate new eof using slot info from our new last sm. 4291 ri, rl, _, err := mb.slotInfo(int(sm.seq - mb.cache.fseq)) 4292 if err != nil { 4293 return 0, 0, err 4294 } 4295 // Calculate new eof. 4296 eof := int64(ri + rl) 4297 4298 var purged, bytes uint64 4299 4300 mb.mu.Lock() 4301 4302 checkDmap := mb.dmap.Size() > 0 4303 var smv StoreMsg 4304 4305 for seq := atomic.LoadUint64(&mb.last.seq); seq > sm.seq; seq-- { 4306 if checkDmap { 4307 if mb.dmap.Exists(seq) { 4308 // Delete and skip to next. 4309 mb.dmap.Delete(seq) 4310 checkDmap = !mb.dmap.IsEmpty() 4311 continue 4312 } 4313 } 4314 // We should have a valid msg to calculate removal stats. 4315 if m, err := mb.cacheLookup(seq, &smv); err == nil { 4316 if mb.msgs > 0 { 4317 rl := fileStoreMsgSize(m.subj, m.hdr, m.msg) 4318 mb.msgs-- 4319 if rl > mb.bytes { 4320 rl = mb.bytes 4321 } 4322 mb.bytes -= rl 4323 mb.rbytes -= rl 4324 // For return accounting. 4325 purged++ 4326 bytes += uint64(rl) 4327 } 4328 } 4329 } 4330 4331 // If the block is compressed then we have to load it into memory 4332 // and decompress it, truncate it and then write it back out. 4333 // Otherwise, truncate the file itself and close the descriptor. 4334 if mb.cmp != NoCompression { 4335 buf, err := mb.loadBlock(nil) 4336 if err != nil { 4337 return 0, 0, fmt.Errorf("failed to load block from disk: %w", err) 4338 } 4339 if mb.bek != nil && len(buf) > 0 { 4340 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 4341 if err != nil { 4342 return 0, 0, err 4343 } 4344 mb.bek = bek 4345 mb.bek.XORKeyStream(buf, buf) 4346 } 4347 buf, err = mb.decompressIfNeeded(buf) 4348 if err != nil { 4349 return 0, 0, fmt.Errorf("failed to decompress block: %w", err) 4350 } 4351 buf = buf[:eof] 4352 copy(mb.lchk[0:], buf[:len(buf)-checksumSize]) 4353 buf, err = mb.cmp.Compress(buf) 4354 if err != nil { 4355 return 0, 0, fmt.Errorf("failed to recompress block: %w", err) 4356 } 4357 meta := &CompressionInfo{ 4358 Algorithm: mb.cmp, 4359 OriginalSize: uint64(eof), 4360 } 4361 buf = append(meta.MarshalMetadata(), buf...) 4362 if mb.bek != nil && len(buf) > 0 { 4363 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 4364 if err != nil { 4365 return 0, 0, err 4366 } 4367 mb.bek = bek 4368 mb.bek.XORKeyStream(buf, buf) 4369 } 4370 n, err := mb.writeAt(buf, 0) 4371 if err != nil { 4372 return 0, 0, fmt.Errorf("failed to rewrite compressed block: %w", err) 4373 } 4374 if n != len(buf) { 4375 return 0, 0, fmt.Errorf("short write (%d != %d)", n, len(buf)) 4376 } 4377 mb.mfd.Truncate(int64(len(buf))) 4378 mb.mfd.Sync() 4379 } else if mb.mfd != nil { 4380 mb.mfd.Truncate(eof) 4381 mb.mfd.Sync() 4382 // Update our checksum. 4383 var lchk [8]byte 4384 mb.mfd.ReadAt(lchk[:], eof-8) 4385 copy(mb.lchk[0:], lchk[:]) 4386 } else { 4387 mb.mu.Unlock() 4388 return 0, 0, fmt.Errorf("failed to truncate msg block %d, file not open", mb.index) 4389 } 4390 4391 // Update our last msg. 4392 atomic.StoreUint64(&mb.last.seq, sm.seq) 4393 mb.last.ts = sm.ts 4394 4395 // Clear our cache. 4396 mb.clearCacheAndOffset() 4397 4398 // Redo per subject info for this block. 4399 mb.resetPerSubjectInfo() 4400 4401 mb.mu.Unlock() 4402 4403 // Load msgs again. 4404 mb.loadMsgs() 4405 4406 return purged, bytes, nil 4407 } 4408 4409 // Helper to determine if the mb is empty. 4410 func (mb *msgBlock) isEmpty() bool { 4411 return atomic.LoadUint64(&mb.first.seq) > atomic.LoadUint64(&mb.last.seq) 4412 } 4413 4414 // Lock should be held. 4415 func (mb *msgBlock) selectNextFirst() { 4416 var seq uint64 4417 fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 4418 for seq = fseq + 1; seq <= lseq; seq++ { 4419 if mb.dmap.Exists(seq) { 4420 // We will move past this so we can delete the entry. 4421 mb.dmap.Delete(seq) 4422 } else { 4423 break 4424 } 4425 } 4426 // Set new first sequence. 4427 atomic.StoreUint64(&mb.first.seq, seq) 4428 4429 // Check if we are empty.. 4430 if seq > lseq { 4431 mb.first.ts = 0 4432 return 4433 } 4434 4435 // Need to get the timestamp. 4436 // We will try the cache direct and fallback if needed. 4437 var smv StoreMsg 4438 sm, _ := mb.cacheLookup(seq, &smv) 4439 if sm == nil { 4440 // Slow path, need to unlock. 4441 mb.mu.Unlock() 4442 sm, _, _ = mb.fetchMsg(seq, &smv) 4443 mb.mu.Lock() 4444 } 4445 if sm != nil { 4446 mb.first.ts = sm.ts 4447 } else { 4448 mb.first.ts = 0 4449 } 4450 } 4451 4452 // Select the next FirstSeq 4453 // Lock should be held. 4454 func (fs *fileStore) selectNextFirst() { 4455 if len(fs.blks) > 0 { 4456 mb := fs.blks[0] 4457 mb.mu.RLock() 4458 fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) 4459 fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() 4460 mb.mu.RUnlock() 4461 } else { 4462 // Could not find anything, so treat like purge 4463 fs.state.FirstSeq = fs.state.LastSeq + 1 4464 fs.state.FirstTime = time.Time{} 4465 } 4466 } 4467 4468 // Lock should be held. 4469 func (mb *msgBlock) resetCacheExpireTimer(td time.Duration) { 4470 if td == 0 { 4471 td = mb.cexp + 100*time.Millisecond 4472 } 4473 if mb.ctmr == nil { 4474 mb.ctmr = time.AfterFunc(td, mb.expireCache) 4475 } else { 4476 mb.ctmr.Reset(td) 4477 } 4478 } 4479 4480 // Lock should be held. 4481 func (mb *msgBlock) startCacheExpireTimer() { 4482 mb.resetCacheExpireTimer(0) 4483 } 4484 4485 // Used when we load in a message block. 4486 // Lock should be held. 4487 func (mb *msgBlock) clearCacheAndOffset() { 4488 // Reset linear scan tracker. 4489 mb.llseq = 0 4490 if mb.cache != nil { 4491 mb.cache.off = 0 4492 mb.cache.wp = 0 4493 } 4494 mb.clearCache() 4495 } 4496 4497 // Lock should be held. 4498 func (mb *msgBlock) clearCache() { 4499 if mb.ctmr != nil && mb.fss == nil { 4500 mb.ctmr.Stop() 4501 mb.ctmr = nil 4502 } 4503 4504 if mb.cache == nil { 4505 return 4506 } 4507 4508 buf := mb.cache.buf 4509 if mb.cache.off == 0 { 4510 mb.cache = nil 4511 } else { 4512 // Clear msgs and index. 4513 mb.cache.buf = nil 4514 mb.cache.idx = nil 4515 mb.cache.wp = 0 4516 } 4517 recycleMsgBlockBuf(buf) 4518 } 4519 4520 // Called to possibly expire a message block cache. 4521 func (mb *msgBlock) expireCache() { 4522 mb.mu.Lock() 4523 defer mb.mu.Unlock() 4524 mb.expireCacheLocked() 4525 } 4526 4527 func (mb *msgBlock) tryForceExpireCache() { 4528 mb.mu.Lock() 4529 defer mb.mu.Unlock() 4530 mb.tryForceExpireCacheLocked() 4531 } 4532 4533 // We will attempt to force expire this by temporarily clearing the last load time. 4534 func (mb *msgBlock) tryForceExpireCacheLocked() { 4535 llts := mb.llts 4536 mb.llts = 0 4537 mb.expireCacheLocked() 4538 mb.llts = llts 4539 } 4540 4541 // This is for expiration of the write cache, which will be partial with fip. 4542 // So we want to bypass the Pools here. 4543 // Lock should be held. 4544 func (mb *msgBlock) tryExpireWriteCache() []byte { 4545 if mb.cache == nil { 4546 return nil 4547 } 4548 lwts, buf, llts, nra := mb.lwts, mb.cache.buf, mb.llts, mb.cache.nra 4549 mb.lwts, mb.cache.nra = 0, true 4550 mb.expireCacheLocked() 4551 mb.lwts = lwts 4552 if mb.cache != nil { 4553 mb.cache.nra = nra 4554 } 4555 // We could check for a certain time since last load, but to be safe just reuse if no loads at all. 4556 if llts == 0 && (mb.cache == nil || mb.cache.buf == nil) { 4557 // Clear last write time since we now are about to move on to a new lmb. 4558 mb.lwts = 0 4559 return buf[:0] 4560 } 4561 return nil 4562 } 4563 4564 // Lock should be held. 4565 func (mb *msgBlock) expireCacheLocked() { 4566 if mb.cache == nil { 4567 if mb.ctmr != nil { 4568 mb.ctmr.Stop() 4569 mb.ctmr = nil 4570 } 4571 return 4572 } 4573 4574 // Can't expire if we still have pending. 4575 if mb.cache != nil && len(mb.cache.buf)-int(mb.cache.wp) > 0 { 4576 mb.resetCacheExpireTimer(mb.cexp) 4577 return 4578 } 4579 4580 // Grab timestamp to compare. 4581 tns := time.Now().UnixNano() 4582 4583 // For the core buffer of messages, we care about reads and writes, but not removes. 4584 bufts := mb.llts 4585 if mb.lwts > bufts { 4586 bufts = mb.lwts 4587 } 4588 4589 // Check for activity on the cache that would prevent us from expiring. 4590 if tns-bufts <= int64(mb.cexp) { 4591 mb.resetCacheExpireTimer(mb.cexp - time.Duration(tns-bufts)) 4592 return 4593 } 4594 4595 // If we are here we will at least expire the core msg buffer. 4596 // We need to capture offset in case we do a write next before a full load. 4597 if mb.cache != nil { 4598 mb.cache.off += len(mb.cache.buf) 4599 if !mb.cache.nra { 4600 recycleMsgBlockBuf(mb.cache.buf) 4601 } 4602 mb.cache.buf = nil 4603 mb.cache.wp = 0 4604 } 4605 4606 // Check if we can clear out our idx unless under force expire. 4607 // fss we keep longer and expire under sync timer checks. 4608 mb.clearCache() 4609 } 4610 4611 func (fs *fileStore) startAgeChk() { 4612 if fs.ageChk == nil && fs.cfg.MaxAge != 0 { 4613 fs.ageChk = time.AfterFunc(fs.cfg.MaxAge, fs.expireMsgs) 4614 } 4615 } 4616 4617 // Lock should be held. 4618 func (fs *fileStore) resetAgeChk(delta int64) { 4619 if fs.cfg.MaxAge == 0 { 4620 return 4621 } 4622 4623 fireIn := fs.cfg.MaxAge 4624 if delta > 0 && time.Duration(delta) < fireIn { 4625 if fireIn = time.Duration(delta); fireIn < time.Second { 4626 // Only fire at most once a second. 4627 // Excessive firing can effect ingest performance. 4628 fireIn = time.Second 4629 } 4630 } 4631 if fs.ageChk != nil { 4632 fs.ageChk.Reset(fireIn) 4633 } else { 4634 fs.ageChk = time.AfterFunc(fireIn, fs.expireMsgs) 4635 } 4636 } 4637 4638 // Lock should be held. 4639 func (fs *fileStore) cancelAgeChk() { 4640 if fs.ageChk != nil { 4641 fs.ageChk.Stop() 4642 fs.ageChk = nil 4643 } 4644 } 4645 4646 // Will expire msgs that are too old. 4647 func (fs *fileStore) expireMsgs() { 4648 // We need to delete one by one here and can not optimize for the time being. 4649 // Reason is that we need more information to adjust ack pending in consumers. 4650 var smv StoreMsg 4651 var sm *StoreMsg 4652 fs.mu.RLock() 4653 maxAge := int64(fs.cfg.MaxAge) 4654 minAge := time.Now().UnixNano() - maxAge 4655 fs.mu.RUnlock() 4656 4657 for sm, _ = fs.msgForSeq(0, &smv); sm != nil && sm.ts <= minAge; sm, _ = fs.msgForSeq(0, &smv) { 4658 fs.mu.Lock() 4659 fs.removeMsgViaLimits(sm.seq) 4660 fs.mu.Unlock() 4661 // Recalculate in case we are expiring a bunch. 4662 minAge = time.Now().UnixNano() - maxAge 4663 } 4664 4665 fs.mu.Lock() 4666 defer fs.mu.Unlock() 4667 4668 // Onky cancel if no message left, not on potential lookup error that would result in sm == nil. 4669 if fs.state.Msgs == 0 { 4670 fs.cancelAgeChk() 4671 } else { 4672 if sm == nil { 4673 fs.resetAgeChk(0) 4674 } else { 4675 fs.resetAgeChk(sm.ts - minAge) 4676 } 4677 } 4678 } 4679 4680 // Lock should be held. 4681 func (fs *fileStore) checkAndFlushAllBlocks() { 4682 for _, mb := range fs.blks { 4683 if mb.pendingWriteSize() > 0 { 4684 // Since fs lock is held need to pull this apart in case we need to rebuild state. 4685 mb.mu.Lock() 4686 ld, _ := mb.flushPendingMsgsLocked() 4687 mb.mu.Unlock() 4688 if ld != nil { 4689 fs.rebuildStateLocked(ld) 4690 } 4691 } 4692 } 4693 } 4694 4695 // This will check all the checksums on messages and report back any sequence numbers with errors. 4696 func (fs *fileStore) checkMsgs() *LostStreamData { 4697 fs.mu.Lock() 4698 defer fs.mu.Unlock() 4699 4700 fs.checkAndFlushAllBlocks() 4701 4702 // Clear any global subject state. 4703 fs.psim, fs.tsl = fs.psim.Empty(), 0 4704 4705 for _, mb := range fs.blks { 4706 // Make sure encryption loaded if needed for the block. 4707 fs.loadEncryptionForMsgBlock(mb) 4708 // FIXME(dlc) - check tombstones here too? 4709 if ld, _, err := mb.rebuildState(); err != nil && ld != nil { 4710 // Rebuild fs state too. 4711 fs.rebuildStateLocked(ld) 4712 } 4713 fs.populateGlobalPerSubjectInfo(mb) 4714 } 4715 4716 return fs.ld 4717 } 4718 4719 // Lock should be held. 4720 func (mb *msgBlock) enableForWriting(fip bool) error { 4721 if mb == nil { 4722 return errNoMsgBlk 4723 } 4724 if mb.mfd != nil { 4725 return nil 4726 } 4727 <-dios 4728 mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms) 4729 dios <- struct{}{} 4730 if err != nil { 4731 return fmt.Errorf("error opening msg block file [%q]: %v", mb.mfn, err) 4732 } 4733 mb.mfd = mfd 4734 4735 // Spin up our flusher loop if needed. 4736 if !fip { 4737 mb.spinUpFlushLoop() 4738 } 4739 4740 return nil 4741 } 4742 4743 // Helper function to place a delete tombstone. 4744 func (mb *msgBlock) writeTombstone(seq uint64, ts int64) error { 4745 return mb.writeMsgRecord(emptyRecordLen, seq|tbit, _EMPTY_, nil, nil, ts, true) 4746 } 4747 4748 // Will write the message record to the underlying message block. 4749 // filestore lock will be held. 4750 func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush bool) error { 4751 mb.mu.Lock() 4752 defer mb.mu.Unlock() 4753 4754 // Enable for writing if our mfd is not open. 4755 if mb.mfd == nil { 4756 if err := mb.enableForWriting(flush); err != nil { 4757 return err 4758 } 4759 } 4760 4761 // Make sure we have a cache setup. 4762 if mb.cache == nil { 4763 mb.setupWriteCache(nil) 4764 } 4765 4766 // Check if we are tracking per subject for our simple state. 4767 // Do this before changing the cache that would trigger a flush pending msgs call 4768 // if we needed to regenerate the per subject info. 4769 // Note that tombstones have no subject so will not trigger here. 4770 if len(subj) > 0 && !mb.noTrack { 4771 if err := mb.ensurePerSubjectInfoLoaded(); err != nil { 4772 return err 4773 } 4774 if ss := mb.fss[subj]; ss != nil { 4775 ss.Msgs++ 4776 ss.Last = seq 4777 } else { 4778 mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} 4779 } 4780 } 4781 4782 // Indexing 4783 index := len(mb.cache.buf) + int(mb.cache.off) 4784 4785 // Formats 4786 // Format with no header 4787 // total_len(4) sequence(8) timestamp(8) subj_len(2) subj msg hash(8) 4788 // With headers, high bit on total length will be set. 4789 // total_len(4) sequence(8) timestamp(8) subj_len(2) subj hdr_len(4) hdr msg hash(8) 4790 4791 // First write header, etc. 4792 var le = binary.LittleEndian 4793 var hdr [msgHdrSize]byte 4794 4795 l := uint32(rl) 4796 hasHeaders := len(mhdr) > 0 4797 if hasHeaders { 4798 l |= hbit 4799 } 4800 4801 le.PutUint32(hdr[0:], l) 4802 le.PutUint64(hdr[4:], seq) 4803 le.PutUint64(hdr[12:], uint64(ts)) 4804 le.PutUint16(hdr[20:], uint16(len(subj))) 4805 4806 // Now write to underlying buffer. 4807 mb.cache.buf = append(mb.cache.buf, hdr[:]...) 4808 mb.cache.buf = append(mb.cache.buf, subj...) 4809 4810 if hasHeaders { 4811 var hlen [4]byte 4812 le.PutUint32(hlen[0:], uint32(len(mhdr))) 4813 mb.cache.buf = append(mb.cache.buf, hlen[:]...) 4814 mb.cache.buf = append(mb.cache.buf, mhdr...) 4815 } 4816 mb.cache.buf = append(mb.cache.buf, msg...) 4817 4818 // Calculate hash. 4819 mb.hh.Reset() 4820 mb.hh.Write(hdr[4:20]) 4821 mb.hh.Write([]byte(subj)) 4822 if hasHeaders { 4823 mb.hh.Write(mhdr) 4824 } 4825 mb.hh.Write(msg) 4826 checksum := mb.hh.Sum(nil) 4827 // Grab last checksum 4828 copy(mb.lchk[0:], checksum) 4829 4830 // Update write through cache. 4831 // Write to msg record. 4832 mb.cache.buf = append(mb.cache.buf, checksum...) 4833 mb.cache.lrl = uint32(rl) 4834 4835 // Set cache timestamp for last store. 4836 mb.lwts = ts 4837 4838 // Only update index and do accounting if not a delete tombstone. 4839 if seq&tbit == 0 { 4840 // Accounting, do this before stripping ebit, it is ebit aware. 4841 mb.updateAccounting(seq, ts, rl) 4842 // Strip ebit if set. 4843 seq = seq &^ ebit 4844 if mb.cache.fseq == 0 { 4845 mb.cache.fseq = seq 4846 } 4847 // Write index 4848 mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit) 4849 } 4850 4851 fch, werr := mb.fch, mb.werr 4852 4853 // If we should be flushing, or had a write error, do so here. 4854 if flush || werr != nil { 4855 ld, err := mb.flushPendingMsgsLocked() 4856 if ld != nil && mb.fs != nil { 4857 // We have the mb lock here, this needs the mb locks so do in its own go routine. 4858 go mb.fs.rebuildState(ld) 4859 } 4860 if err != nil { 4861 return err 4862 } 4863 } else { 4864 // Kick the flusher here. 4865 kickFlusher(fch) 4866 } 4867 4868 return nil 4869 } 4870 4871 // How many bytes pending to be written for this message block. 4872 func (mb *msgBlock) pendingWriteSize() int { 4873 if mb == nil { 4874 return 0 4875 } 4876 mb.mu.RLock() 4877 defer mb.mu.RUnlock() 4878 return mb.pendingWriteSizeLocked() 4879 } 4880 4881 // How many bytes pending to be written for this message block. 4882 func (mb *msgBlock) pendingWriteSizeLocked() int { 4883 if mb == nil { 4884 return 0 4885 } 4886 var pending int 4887 if !mb.closed && mb.mfd != nil && mb.cache != nil { 4888 pending = len(mb.cache.buf) - int(mb.cache.wp) 4889 } 4890 return pending 4891 } 4892 4893 // Try to close our FDs if we can. 4894 func (mb *msgBlock) closeFDs() error { 4895 mb.mu.Lock() 4896 defer mb.mu.Unlock() 4897 return mb.closeFDsLocked() 4898 } 4899 4900 func (mb *msgBlock) closeFDsLocked() error { 4901 if buf, _ := mb.bytesPending(); len(buf) > 0 { 4902 return errPendingData 4903 } 4904 mb.closeFDsLockedNoCheck() 4905 return nil 4906 } 4907 4908 func (mb *msgBlock) closeFDsLockedNoCheck() { 4909 if mb.mfd != nil { 4910 mb.mfd.Close() 4911 mb.mfd = nil 4912 } 4913 } 4914 4915 // bytesPending returns the buffer to be used for writing to the underlying file. 4916 // This marks we are in flush and will return nil if asked again until cleared. 4917 // Lock should be held. 4918 func (mb *msgBlock) bytesPending() ([]byte, error) { 4919 if mb == nil || mb.mfd == nil { 4920 return nil, errNoPending 4921 } 4922 if mb.cache == nil { 4923 return nil, errNoCache 4924 } 4925 if len(mb.cache.buf) <= mb.cache.wp { 4926 return nil, errNoPending 4927 } 4928 buf := mb.cache.buf[mb.cache.wp:] 4929 if len(buf) == 0 { 4930 return nil, errNoPending 4931 } 4932 return buf, nil 4933 } 4934 4935 // Returns the current blkSize including deleted msgs etc. 4936 func (mb *msgBlock) blkSize() uint64 { 4937 mb.mu.RLock() 4938 nb := mb.rbytes 4939 mb.mu.RUnlock() 4940 return nb 4941 } 4942 4943 // Update accounting on a write msg. 4944 // Lock should be held. 4945 func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) { 4946 isDeleted := seq&ebit != 0 4947 if isDeleted { 4948 seq = seq &^ ebit 4949 } 4950 4951 fseq := atomic.LoadUint64(&mb.first.seq) 4952 if (fseq == 0 || mb.first.ts == 0) && seq >= fseq { 4953 atomic.StoreUint64(&mb.first.seq, seq) 4954 mb.first.ts = ts 4955 } 4956 // Need atomics here for selectMsgBlock speed. 4957 atomic.StoreUint64(&mb.last.seq, seq) 4958 mb.last.ts = ts 4959 mb.rbytes += rl 4960 if !isDeleted { 4961 mb.bytes += rl 4962 mb.msgs++ 4963 } 4964 } 4965 4966 // Lock should be held. 4967 func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg []byte) (uint64, error) { 4968 var err error 4969 4970 // Get size for this message. 4971 rl := fileStoreMsgSize(subj, hdr, msg) 4972 if rl&hbit != 0 { 4973 return 0, ErrMsgTooLarge 4974 } 4975 // Grab our current last message block. 4976 mb := fs.lmb 4977 4978 // Mark as dirty for stream state. 4979 fs.dirty++ 4980 4981 if mb == nil || mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize { 4982 if mb != nil && fs.fcfg.Compression != NoCompression { 4983 // We've now reached the end of this message block, if we want 4984 // to compress blocks then now's the time to do it. 4985 go mb.recompressOnDiskIfNeeded() 4986 } 4987 if mb, err = fs.newMsgBlockForWrite(); err != nil { 4988 return 0, err 4989 } 4990 } 4991 4992 // Ask msg block to store in write through cache. 4993 err = mb.writeMsgRecord(rl, seq, subj, hdr, msg, ts, fs.fip) 4994 4995 return rl, err 4996 } 4997 4998 func (mb *msgBlock) recompressOnDiskIfNeeded() error { 4999 alg := mb.fs.fcfg.Compression 5000 mb.mu.Lock() 5001 defer mb.mu.Unlock() 5002 5003 origFN := mb.mfn // The original message block on disk. 5004 tmpFN := mb.mfn + compressTmpSuffix // The compressed block will be written here. 5005 5006 // Open up the file block and read in the entire contents into memory. 5007 // One of two things will happen: 5008 // 1. The block will be compressed already and have a valid metadata 5009 // header, in which case we do nothing. 5010 // 2. The block will be uncompressed, in which case we will compress it 5011 // and then write it back out to disk, reencrypting if necessary. 5012 <-dios 5013 origBuf, err := os.ReadFile(origFN) 5014 dios <- struct{}{} 5015 5016 if err != nil { 5017 return fmt.Errorf("failed to read original block from disk: %w", err) 5018 } 5019 5020 // If the block is encrypted then we will need to decrypt it before 5021 // doing anything. We always encrypt after compressing because then the 5022 // compression can be as efficient as possible on the raw data, whereas 5023 // the encrypted ciphertext will not compress anywhere near as well. 5024 // The block encryption also covers the optional compression metadata. 5025 if mb.bek != nil && len(origBuf) > 0 { 5026 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 5027 if err != nil { 5028 return err 5029 } 5030 mb.bek = bek 5031 mb.bek.XORKeyStream(origBuf, origBuf) 5032 } 5033 5034 meta := &CompressionInfo{} 5035 if _, err := meta.UnmarshalMetadata(origBuf); err != nil { 5036 // An error is only returned here if there's a problem with parsing 5037 // the metadata. If the file has no metadata at all, no error is 5038 // returned and the algorithm defaults to no compression. 5039 return fmt.Errorf("failed to read existing metadata header: %w", err) 5040 } 5041 if meta.Algorithm == alg { 5042 // The block is already compressed with the chosen algorithm so there 5043 // is nothing else to do. This is not a common case, it is here only 5044 // to ensure we don't do unnecessary work in case something asked us 5045 // to recompress an already compressed block with the same algorithm. 5046 return nil 5047 } else if alg != NoCompression { 5048 // The block is already compressed using some algorithm, so we need 5049 // to decompress the block using the existing algorithm before we can 5050 // recompress it with the new one. 5051 if origBuf, err = meta.Algorithm.Decompress(origBuf); err != nil { 5052 return fmt.Errorf("failed to decompress original block: %w", err) 5053 } 5054 } 5055 5056 // Rather than modifying the existing block on disk (which is a dangerous 5057 // operation if something goes wrong), create a new temporary file. We will 5058 // write out the new block here and then swap the files around afterwards 5059 // once everything else has succeeded correctly. 5060 <-dios 5061 tmpFD, err := os.OpenFile(tmpFN, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, defaultFilePerms) 5062 dios <- struct{}{} 5063 if err != nil { 5064 return fmt.Errorf("failed to create temporary file: %w", err) 5065 } 5066 5067 // The original buffer at this point is uncompressed, so we will now compress 5068 // it if needed. Note that if the selected algorithm is NoCompression, the 5069 // Compress function will just return the input buffer unmodified. 5070 cmpBuf, err := alg.Compress(origBuf) 5071 if err != nil { 5072 return fmt.Errorf("failed to compress block: %w", err) 5073 } 5074 5075 // We only need to write out the metadata header if compression is enabled. 5076 // If we're trying to uncompress the file on disk at this point, don't bother 5077 // writing metadata. 5078 if alg != NoCompression { 5079 meta := &CompressionInfo{ 5080 Algorithm: alg, 5081 OriginalSize: uint64(len(origBuf)), 5082 } 5083 cmpBuf = append(meta.MarshalMetadata(), cmpBuf...) 5084 } 5085 5086 // Re-encrypt the block if necessary. 5087 if mb.bek != nil && len(cmpBuf) > 0 { 5088 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 5089 if err != nil { 5090 return err 5091 } 5092 mb.bek = bek 5093 mb.bek.XORKeyStream(cmpBuf, cmpBuf) 5094 } 5095 5096 // Write the new block data (which might be compressed or encrypted) to the 5097 // temporary file. 5098 errorCleanup := func(err error) error { 5099 tmpFD.Close() 5100 os.Remove(tmpFN) 5101 return err 5102 } 5103 if n, err := tmpFD.Write(cmpBuf); err != nil { 5104 return errorCleanup(fmt.Errorf("failed to write to temporary file: %w", err)) 5105 } else if n != len(cmpBuf) { 5106 return errorCleanup(fmt.Errorf("short write to temporary file (%d != %d)", n, len(cmpBuf))) 5107 } 5108 if err := tmpFD.Sync(); err != nil { 5109 return errorCleanup(fmt.Errorf("failed to sync temporary file: %w", err)) 5110 } 5111 if err := tmpFD.Close(); err != nil { 5112 return errorCleanup(fmt.Errorf("failed to close temporary file: %w", err)) 5113 } 5114 5115 // Now replace the original file with the newly updated temp file. 5116 if err := os.Rename(tmpFN, origFN); err != nil { 5117 return fmt.Errorf("failed to move temporary file into place: %w", err) 5118 } 5119 5120 // Since the message block might be retained in memory, make sure the 5121 // compression algorithm is up-to-date, since this will be needed when 5122 // compacting or truncating. 5123 mb.cmp = alg 5124 return nil 5125 } 5126 5127 func (mb *msgBlock) decompressIfNeeded(buf []byte) ([]byte, error) { 5128 var meta CompressionInfo 5129 if n, err := meta.UnmarshalMetadata(buf); err != nil { 5130 // There was a problem parsing the metadata header of the block. 5131 // If there's no metadata header, an error isn't returned here, 5132 // we will instead just use default values of no compression. 5133 return nil, err 5134 } else if n == 0 { 5135 // There were no metadata bytes, so we assume the block is not 5136 // compressed and return it as-is. 5137 return buf, nil 5138 } else { 5139 // Metadata was present so it's quite likely the block contents 5140 // are compressed. If by any chance the metadata claims that the 5141 // block is uncompressed, then the input slice is just returned 5142 // unmodified. 5143 return meta.Algorithm.Decompress(buf[n:]) 5144 } 5145 } 5146 5147 // Lock should be held. 5148 func (mb *msgBlock) ensureRawBytesLoaded() error { 5149 if mb.rbytes > 0 { 5150 return nil 5151 } 5152 f, err := mb.openBlock() 5153 if err != nil { 5154 return err 5155 } 5156 defer f.Close() 5157 if fi, err := f.Stat(); fi != nil && err == nil { 5158 mb.rbytes = uint64(fi.Size()) 5159 } else { 5160 return err 5161 } 5162 return nil 5163 } 5164 5165 // Sync msg and index files as needed. This is called from a timer. 5166 func (fs *fileStore) syncBlocks() { 5167 fs.mu.RLock() 5168 if fs.closed { 5169 fs.mu.RUnlock() 5170 return 5171 } 5172 blks := append([]*msgBlock(nil), fs.blks...) 5173 lmb := fs.lmb 5174 syncInterval := fs.fcfg.SyncInterval 5175 fs.mu.RUnlock() 5176 5177 var markDirty bool 5178 for _, mb := range blks { 5179 // Do actual sync. Hold lock for consistency. 5180 mb.mu.Lock() 5181 if mb.closed { 5182 mb.mu.Unlock() 5183 continue 5184 } 5185 // See if we can close FDs due to being idle. 5186 if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle { 5187 mb.dirtyCloseWithRemove(false) 5188 } 5189 // Check our fss subject metadata. 5190 // If we have no activity within sync interval remove. 5191 if mb.fssLoaded() && mb.sinceLastActivity() > syncInterval { 5192 mb.fss = nil 5193 } 5194 5195 // Check if we should compact here as well. 5196 // Do not compact last mb. 5197 var needsCompact bool 5198 if mb != lmb && mb.ensureRawBytesLoaded() == nil && mb.rbytes > mb.bytes { 5199 needsCompact = true 5200 markDirty = true 5201 } 5202 5203 // Check if we need to sync. We will not hold lock during actual sync. 5204 needSync, fn := mb.needSync, mb.mfn 5205 if needSync { 5206 // Flush anything that may be pending. 5207 mb.flushPendingMsgsLocked() 5208 } 5209 mb.mu.Unlock() 5210 5211 // Check if we should compact here. 5212 // Need to hold fs lock in case we reference psim when loading in the mb. 5213 if needsCompact { 5214 fs.mu.RLock() 5215 mb.mu.Lock() 5216 mb.compact() 5217 mb.mu.Unlock() 5218 fs.mu.RUnlock() 5219 } 5220 5221 // Check if we need to sync. 5222 // This is done not holding any locks. 5223 if needSync { 5224 <-dios 5225 fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms) 5226 dios <- struct{}{} 5227 // If we have an fd. 5228 if fd != nil { 5229 canClear := fd.Sync() == nil 5230 fd.Close() 5231 // Only clear sync flag on success. 5232 if canClear { 5233 mb.mu.Lock() 5234 mb.needSync = false 5235 mb.mu.Unlock() 5236 } 5237 } 5238 } 5239 } 5240 5241 fs.mu.Lock() 5242 if fs.closed { 5243 fs.mu.Unlock() 5244 return 5245 } 5246 fs.setSyncTimer() 5247 fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) 5248 syncAlways := fs.fcfg.SyncAlways 5249 if markDirty { 5250 fs.dirty++ 5251 } 5252 fs.mu.Unlock() 5253 5254 // Sync state file if we are not running with sync always. 5255 if !syncAlways { 5256 <-dios 5257 fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms) 5258 dios <- struct{}{} 5259 if fd != nil { 5260 fd.Sync() 5261 fd.Close() 5262 } 5263 } 5264 } 5265 5266 // Select the message block where this message should be found. 5267 // Return nil if not in the set. 5268 // Read lock should be held. 5269 func (fs *fileStore) selectMsgBlock(seq uint64) *msgBlock { 5270 _, mb := fs.selectMsgBlockWithIndex(seq) 5271 return mb 5272 } 5273 5274 // Lock should be held. 5275 func (fs *fileStore) selectMsgBlockWithIndex(seq uint64) (int, *msgBlock) { 5276 // Check for out of range. 5277 if seq < fs.state.FirstSeq || seq > fs.state.LastSeq { 5278 return -1, nil 5279 } 5280 5281 const linearThresh = 32 5282 nb := len(fs.blks) - 1 5283 5284 if nb < linearThresh { 5285 for i, mb := range fs.blks { 5286 if seq <= atomic.LoadUint64(&mb.last.seq) { 5287 return i, mb 5288 } 5289 } 5290 return -1, nil 5291 } 5292 5293 // Do traditional binary search here since we know the blocks are sorted by sequence first and last. 5294 for low, high, mid := 0, nb, nb/2; low <= high; mid = (low + high) / 2 { 5295 mb := fs.blks[mid] 5296 // Right now these atomic loads do not factor in, so fine to leave. Was considering 5297 // uplifting these to fs scope to avoid atomic load but not needed. 5298 first, last := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 5299 if seq > last { 5300 low = mid + 1 5301 } else if seq < first { 5302 // A message block's first sequence can change here meaning we could find a gap. 5303 // We want to behave like above, which if inclusive (we check at start) should 5304 // always return an index and a valid mb. 5305 // If we have a gap then our seq would be > fs.blks[mid-1].last.seq 5306 if mid == 0 || seq > atomic.LoadUint64(&fs.blks[mid-1].last.seq) { 5307 return mid, mb 5308 } 5309 high = mid - 1 5310 } else { 5311 return mid, mb 5312 } 5313 } 5314 5315 return -1, nil 5316 } 5317 5318 // Select the message block where this message should be found. 5319 // Return nil if not in the set. 5320 func (fs *fileStore) selectMsgBlockForStart(minTime time.Time) *msgBlock { 5321 fs.mu.RLock() 5322 defer fs.mu.RUnlock() 5323 5324 t := minTime.UnixNano() 5325 for _, mb := range fs.blks { 5326 mb.mu.RLock() 5327 found := t <= mb.last.ts 5328 mb.mu.RUnlock() 5329 if found { 5330 return mb 5331 } 5332 } 5333 return nil 5334 } 5335 5336 // Index a raw msg buffer. 5337 // Lock should be held. 5338 func (mb *msgBlock) indexCacheBuf(buf []byte) error { 5339 var le = binary.LittleEndian 5340 5341 var fseq uint64 5342 var idx []uint32 5343 var index uint32 5344 5345 mbFirstSeq := atomic.LoadUint64(&mb.first.seq) 5346 mbLastSeq := atomic.LoadUint64(&mb.last.seq) 5347 5348 // Capture beginning size of dmap. 5349 dms := uint64(mb.dmap.Size()) 5350 idxSz := mbLastSeq - mbFirstSeq + 1 5351 5352 if mb.cache == nil { 5353 // Approximation, may adjust below. 5354 fseq = mbFirstSeq 5355 idx = make([]uint32, 0, idxSz) 5356 mb.cache = &cache{} 5357 } else { 5358 fseq = mb.cache.fseq 5359 idx = mb.cache.idx 5360 if len(idx) == 0 { 5361 idx = make([]uint32, 0, idxSz) 5362 } 5363 index = uint32(len(mb.cache.buf)) 5364 buf = append(mb.cache.buf, buf...) 5365 } 5366 5367 // Create FSS if we should track. 5368 var popFss bool 5369 if mb.fssNotLoaded() { 5370 mb.fss = make(map[string]*SimpleState) 5371 popFss = true 5372 } 5373 5374 lbuf := uint32(len(buf)) 5375 var seq uint64 5376 for index < lbuf { 5377 if index+msgHdrSize > lbuf { 5378 return errCorruptState 5379 } 5380 hdr := buf[index : index+msgHdrSize] 5381 rl, slen := le.Uint32(hdr[0:]), int(le.Uint16(hdr[20:])) 5382 seq = le.Uint64(hdr[4:]) 5383 5384 // Clear any headers bit that could be set. 5385 rl &^= hbit 5386 dlen := int(rl) - msgHdrSize 5387 5388 // Do some quick sanity checks here. 5389 if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { 5390 // This means something is off. 5391 // TODO(dlc) - Add into bad list? 5392 return errCorruptState 5393 } 5394 5395 // Check for tombstones which we can skip in terms of indexing. 5396 if seq&tbit != 0 { 5397 index += rl 5398 continue 5399 } 5400 5401 // Clear any erase bits. 5402 erased := seq&ebit != 0 5403 seq = seq &^ ebit 5404 5405 // We defer checksum checks to individual msg cache lookups to amortorize costs and 5406 // not introduce latency for first message from a newly loaded block. 5407 if seq >= mbFirstSeq { 5408 // Track that we do not have holes. 5409 if slot := int(seq - mbFirstSeq); slot != len(idx) { 5410 // If we have a hole fill it. 5411 for dseq := mbFirstSeq + uint64(len(idx)); dseq < seq; dseq++ { 5412 idx = append(idx, dbit) 5413 if dms == 0 { 5414 mb.dmap.Insert(dseq) 5415 } 5416 } 5417 } 5418 // Add to our index. 5419 idx = append(idx, index) 5420 mb.cache.lrl = uint32(rl) 5421 // Adjust if we guessed wrong. 5422 if seq != 0 && seq < fseq { 5423 fseq = seq 5424 } 5425 5426 // Make sure our dmap has this entry if it was erased. 5427 if erased && dms == 0 { 5428 mb.dmap.Insert(seq) 5429 } 5430 5431 // Handle FSS inline here. 5432 if popFss && slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) { 5433 bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)] 5434 if ss := mb.fss[string(bsubj)]; ss != nil { 5435 ss.Msgs++ 5436 ss.Last = seq 5437 } else { 5438 mb.fss[string(bsubj)] = &SimpleState{ 5439 Msgs: 1, 5440 First: seq, 5441 Last: seq, 5442 } 5443 } 5444 } 5445 } 5446 index += rl 5447 } 5448 5449 // Track holes at the end of the block, these would be missed in the 5450 // earlier loop if we've ran out of block file to look at, but should 5451 // be easily noticed because the seq will be below the last seq from 5452 // the index. 5453 if seq > 0 && seq < mbLastSeq { 5454 for dseq := seq; dseq < mbLastSeq; dseq++ { 5455 idx = append(idx, dbit) 5456 if dms == 0 { 5457 mb.dmap.Insert(dseq) 5458 } 5459 } 5460 } 5461 5462 mb.cache.buf = buf 5463 mb.cache.idx = idx 5464 mb.cache.fseq = fseq 5465 mb.cache.wp += int(lbuf) 5466 5467 return nil 5468 } 5469 5470 // flushPendingMsgs writes out any messages for this message block. 5471 func (mb *msgBlock) flushPendingMsgs() error { 5472 mb.mu.Lock() 5473 fsLostData, err := mb.flushPendingMsgsLocked() 5474 fs := mb.fs 5475 mb.mu.Unlock() 5476 5477 // Signals us that we need to rebuild filestore state. 5478 if fsLostData != nil && fs != nil { 5479 // Rebuild fs state too. 5480 fs.rebuildState(fsLostData) 5481 } 5482 return err 5483 } 5484 5485 // Write function for actual data. 5486 // mb.mfd should not be nil. 5487 // Lock should held. 5488 func (mb *msgBlock) writeAt(buf []byte, woff int64) (int, error) { 5489 // Used to mock write failures. 5490 if mb.mockWriteErr { 5491 // Reset on trip. 5492 mb.mockWriteErr = false 5493 return 0, errors.New("mock write error") 5494 } 5495 return mb.mfd.WriteAt(buf, woff) 5496 } 5497 5498 // flushPendingMsgsLocked writes out any messages for this message block. 5499 // Lock should be held. 5500 func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) { 5501 // Signals us that we need to rebuild filestore state. 5502 var fsLostData *LostStreamData 5503 5504 if mb.cache == nil || mb.mfd == nil { 5505 return nil, nil 5506 } 5507 5508 buf, err := mb.bytesPending() 5509 // If we got an error back return here. 5510 if err != nil { 5511 // No pending data to be written is not an error. 5512 if err == errNoPending || err == errNoCache { 5513 err = nil 5514 } 5515 return nil, err 5516 } 5517 5518 woff := int64(mb.cache.off + mb.cache.wp) 5519 lob := len(buf) 5520 5521 // TODO(dlc) - Normally we would not hold the lock across I/O so we can improve performance. 5522 // We will hold to stabilize the code base, as we have had a few anomalies with partial cache errors 5523 // under heavy load. 5524 5525 // Check if we need to encrypt. 5526 if mb.bek != nil && lob > 0 { 5527 // Need to leave original alone. 5528 var dst []byte 5529 if lob <= defaultLargeBlockSize { 5530 dst = getMsgBlockBuf(lob)[:lob] 5531 } else { 5532 dst = make([]byte, lob) 5533 } 5534 mb.bek.XORKeyStream(dst, buf) 5535 buf = dst 5536 } 5537 5538 // Append new data to the message block file. 5539 for lbb := lob; lbb > 0; lbb = len(buf) { 5540 n, err := mb.writeAt(buf, woff) 5541 if err != nil { 5542 mb.dirtyCloseWithRemove(false) 5543 ld, _, _ := mb.rebuildStateLocked() 5544 mb.werr = err 5545 return ld, err 5546 } 5547 // Update our write offset. 5548 woff += int64(n) 5549 // Partial write. 5550 if n != lbb { 5551 buf = buf[n:] 5552 } else { 5553 // Done. 5554 break 5555 } 5556 } 5557 5558 // Clear any error. 5559 mb.werr = nil 5560 5561 // Cache may be gone. 5562 if mb.cache == nil || mb.mfd == nil { 5563 return fsLostData, mb.werr 5564 } 5565 5566 // Check if we are in sync always mode. 5567 if mb.syncAlways { 5568 mb.mfd.Sync() 5569 } else { 5570 mb.needSync = true 5571 } 5572 5573 // Check for additional writes while we were writing to the disk. 5574 moreBytes := len(mb.cache.buf) - mb.cache.wp - lob 5575 5576 // Decide what we want to do with the buffer in hand. If we have load interest 5577 // we will hold onto the whole thing, otherwise empty the buffer, possibly reusing it. 5578 if ts := time.Now().UnixNano(); ts < mb.llts || (ts-mb.llts) <= int64(mb.cexp) { 5579 mb.cache.wp += lob 5580 } else { 5581 if cap(mb.cache.buf) <= maxBufReuse { 5582 buf = mb.cache.buf[:0] 5583 } else { 5584 recycleMsgBlockBuf(mb.cache.buf) 5585 buf = nil 5586 } 5587 if moreBytes > 0 { 5588 nbuf := mb.cache.buf[len(mb.cache.buf)-moreBytes:] 5589 if moreBytes > (len(mb.cache.buf)/4*3) && cap(nbuf) <= maxBufReuse { 5590 buf = nbuf 5591 } else { 5592 buf = append(buf, nbuf...) 5593 } 5594 } 5595 // Update our cache offset. 5596 mb.cache.off = int(woff) 5597 // Reset write pointer. 5598 mb.cache.wp = 0 5599 // Place buffer back in the cache structure. 5600 mb.cache.buf = buf 5601 // Mark fseq to 0 5602 mb.cache.fseq = 0 5603 } 5604 5605 return fsLostData, mb.werr 5606 } 5607 5608 // Lock should be held. 5609 func (mb *msgBlock) clearLoading() { 5610 mb.loading = false 5611 } 5612 5613 // Will load msgs from disk. 5614 func (mb *msgBlock) loadMsgs() error { 5615 // We hold the lock here the whole time by design. 5616 mb.mu.Lock() 5617 defer mb.mu.Unlock() 5618 return mb.loadMsgsWithLock() 5619 } 5620 5621 // Lock should be held. 5622 func (mb *msgBlock) cacheAlreadyLoaded() bool { 5623 if mb.cache == nil || mb.cache.off != 0 || mb.cache.fseq == 0 || len(mb.cache.buf) == 0 { 5624 return false 5625 } 5626 numEntries := mb.msgs + uint64(mb.dmap.Size()) + (atomic.LoadUint64(&mb.first.seq) - mb.cache.fseq) 5627 return numEntries == uint64(len(mb.cache.idx)) 5628 } 5629 5630 // Lock should be held. 5631 func (mb *msgBlock) cacheNotLoaded() bool { 5632 return !mb.cacheAlreadyLoaded() 5633 } 5634 5635 // Report if our fss is not loaded. 5636 // Lock should be held. 5637 func (mb *msgBlock) fssNotLoaded() bool { 5638 return mb.fss == nil && !mb.noTrack 5639 } 5640 5641 // Report if we have our fss loaded. 5642 // Lock should be held. 5643 func (mb *msgBlock) fssLoaded() bool { 5644 return mb.fss != nil 5645 } 5646 5647 // Wrap openBlock for the gated semaphore processing. 5648 // Lock should be held 5649 func (mb *msgBlock) openBlock() (*os.File, error) { 5650 // Gate with concurrent IO semaphore. 5651 <-dios 5652 f, err := os.Open(mb.mfn) 5653 dios <- struct{}{} 5654 return f, err 5655 } 5656 5657 // Used to load in the block contents. 5658 // Lock should be held and all conditionals satisfied prior. 5659 func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) { 5660 var f *os.File 5661 // Re-use if we have mfd open. 5662 if mb.mfd != nil { 5663 f = mb.mfd 5664 if n, err := f.Seek(0, 0); n != 0 || err != nil { 5665 f = nil 5666 mb.closeFDsLockedNoCheck() 5667 } 5668 } 5669 if f == nil { 5670 var err error 5671 f, err = mb.openBlock() 5672 if err != nil { 5673 if os.IsNotExist(err) { 5674 err = errNoBlkData 5675 } 5676 return nil, err 5677 } 5678 defer f.Close() 5679 } 5680 5681 var sz int 5682 if info, err := f.Stat(); err == nil { 5683 sz64 := info.Size() 5684 if int64(int(sz64)) == sz64 { 5685 sz = int(sz64) 5686 } else { 5687 return nil, errMsgBlkTooBig 5688 } 5689 } 5690 5691 if buf == nil { 5692 buf = getMsgBlockBuf(sz) 5693 if sz > cap(buf) { 5694 // We know we will make a new one so just recycle for now. 5695 recycleMsgBlockBuf(buf) 5696 buf = nil 5697 } 5698 } 5699 5700 if sz > cap(buf) { 5701 buf = make([]byte, sz) 5702 } else { 5703 buf = buf[:sz] 5704 } 5705 5706 <-dios 5707 n, err := io.ReadFull(f, buf) 5708 dios <- struct{}{} 5709 // On success capture raw bytes size. 5710 if err == nil { 5711 mb.rbytes = uint64(n) 5712 } 5713 return buf[:n], err 5714 } 5715 5716 // Lock should be held. 5717 func (mb *msgBlock) loadMsgsWithLock() error { 5718 // Check for encryption, we do not load keys on startup anymore so might need to load them here. 5719 if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) { 5720 if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil { 5721 return err 5722 } 5723 } 5724 5725 // Check to see if we are loading already. 5726 if mb.loading { 5727 return nil 5728 } 5729 5730 // Set loading status. 5731 mb.loading = true 5732 defer mb.clearLoading() 5733 5734 var nchecks int 5735 5736 checkCache: 5737 nchecks++ 5738 if nchecks > 8 { 5739 return errCorruptState 5740 } 5741 5742 // Check to see if we have a full cache. 5743 if mb.cacheAlreadyLoaded() { 5744 return nil 5745 } 5746 5747 mb.llts = time.Now().UnixNano() 5748 5749 // FIXME(dlc) - We could be smarter here. 5750 if buf, _ := mb.bytesPending(); len(buf) > 0 { 5751 ld, err := mb.flushPendingMsgsLocked() 5752 if ld != nil && mb.fs != nil { 5753 // We do not know if fs is locked or not at this point. 5754 // This should be an exceptional condition so do so in Go routine. 5755 go mb.fs.rebuildState(ld) 5756 } 5757 if err != nil { 5758 return err 5759 } 5760 goto checkCache 5761 } 5762 5763 // Load in the whole block. 5764 // We want to hold the mb lock here to avoid any changes to state. 5765 buf, err := mb.loadBlock(nil) 5766 if err != nil { 5767 if err == errNoBlkData { 5768 if ld, _, err := mb.rebuildStateLocked(); err != nil && ld != nil { 5769 // Rebuild fs state too. 5770 go mb.fs.rebuildState(ld) 5771 } 5772 } 5773 return err 5774 } 5775 5776 // Reset the cache since we just read everything in. 5777 // Make sure this is cleared in case we had a partial when we started. 5778 mb.clearCacheAndOffset() 5779 5780 // Check if we need to decrypt. 5781 if mb.bek != nil && len(buf) > 0 { 5782 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 5783 if err != nil { 5784 return err 5785 } 5786 mb.bek = bek 5787 mb.bek.XORKeyStream(buf, buf) 5788 } 5789 5790 // Check for compression. 5791 if buf, err = mb.decompressIfNeeded(buf); err != nil { 5792 return err 5793 } 5794 5795 if err := mb.indexCacheBuf(buf); err != nil { 5796 if err == errCorruptState { 5797 var ld *LostStreamData 5798 if ld, _, err = mb.rebuildStateLocked(); ld != nil { 5799 // We do not know if fs is locked or not at this point. 5800 // This should be an exceptional condition so do so in Go routine. 5801 go mb.fs.rebuildState(ld) 5802 } 5803 } 5804 if err != nil { 5805 return err 5806 } 5807 goto checkCache 5808 } 5809 5810 if len(buf) > 0 { 5811 mb.cloads++ 5812 mb.startCacheExpireTimer() 5813 } 5814 5815 return nil 5816 } 5817 5818 // Fetch a message from this block, possibly reading in and caching the messages. 5819 // We assume the block was selected and is correct, so we do not do range checks. 5820 func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) { 5821 mb.mu.Lock() 5822 defer mb.mu.Unlock() 5823 5824 fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 5825 if seq < fseq || seq > lseq { 5826 return nil, false, ErrStoreMsgNotFound 5827 } 5828 5829 // See if we can short circuit if we already know msg deleted. 5830 if mb.dmap.Exists(seq) { 5831 // Update for scanning like cacheLookup would have. 5832 llseq := mb.llseq 5833 if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 || seq == mb.llseq-1 { 5834 mb.llseq = seq 5835 } 5836 expireOk := (seq == lseq && llseq == seq-1) || (seq == fseq && llseq == seq+1) 5837 return nil, expireOk, errDeletedMsg 5838 } 5839 5840 if mb.cacheNotLoaded() { 5841 if err := mb.loadMsgsWithLock(); err != nil { 5842 return nil, false, err 5843 } 5844 } 5845 llseq := mb.llseq 5846 5847 fsm, err := mb.cacheLookup(seq, sm) 5848 if err != nil { 5849 return nil, false, err 5850 } 5851 expireOk := (seq == lseq && llseq == seq-1) || (seq == fseq && llseq == seq+1) 5852 return fsm, expireOk, err 5853 } 5854 5855 var ( 5856 errNoCache = errors.New("no message cache") 5857 errBadMsg = errors.New("malformed or corrupt message") 5858 errDeletedMsg = errors.New("deleted message") 5859 errPartialCache = errors.New("partial cache") 5860 errNoPending = errors.New("message block does not have pending data") 5861 errNotReadable = errors.New("storage directory not readable") 5862 errCorruptState = errors.New("corrupt state file") 5863 errPriorState = errors.New("prior state file") 5864 errPendingData = errors.New("pending data still present") 5865 errNoEncryption = errors.New("encryption not enabled") 5866 errBadKeySize = errors.New("encryption bad key size") 5867 errNoMsgBlk = errors.New("no message block") 5868 errMsgBlkTooBig = errors.New("message block size exceeded int capacity") 5869 errUnknownCipher = errors.New("unknown cipher") 5870 errNoMainKey = errors.New("encrypted store encountered with no main key") 5871 errNoBlkData = errors.New("message block data missing") 5872 ) 5873 5874 const ( 5875 // Used for marking messages that have had their checksums checked. 5876 // Used to signal a message record with headers. 5877 hbit = 1 << 31 5878 // Used for marking erased messages sequences. 5879 ebit = 1 << 63 5880 // Used for marking tombstone sequences. 5881 tbit = 1 << 62 5882 // Used to mark an index as deleted and non-existent. 5883 dbit = 1 << 30 5884 ) 5885 5886 // Will do a lookup from cache. 5887 // Lock should be held. 5888 func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) { 5889 if seq < atomic.LoadUint64(&mb.first.seq) || seq > atomic.LoadUint64(&mb.last.seq) { 5890 return nil, ErrStoreMsgNotFound 5891 } 5892 5893 // The llseq signals us when we can expire a cache at the end of a linear scan. 5894 // We want to only update when we know the last reads (multiple consumers) are sequential. 5895 // We want to account for forwards and backwards linear scans. 5896 if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 || seq == mb.llseq-1 { 5897 mb.llseq = seq 5898 } 5899 5900 // If we have a delete map check it. 5901 if mb.dmap.Exists(seq) { 5902 mb.llts = time.Now().UnixNano() 5903 return nil, errDeletedMsg 5904 } 5905 5906 // Detect no cache loaded. 5907 if mb.cache == nil || mb.cache.fseq == 0 || len(mb.cache.idx) == 0 || len(mb.cache.buf) == 0 { 5908 return nil, errNoCache 5909 } 5910 // Check partial cache status. 5911 if seq < mb.cache.fseq { 5912 return nil, errPartialCache 5913 } 5914 5915 bi, _, hashChecked, err := mb.slotInfo(int(seq - mb.cache.fseq)) 5916 if err != nil { 5917 return nil, err 5918 } 5919 5920 // Update cache activity. 5921 mb.llts = time.Now().UnixNano() 5922 5923 li := int(bi) - mb.cache.off 5924 if li >= len(mb.cache.buf) { 5925 return nil, errPartialCache 5926 } 5927 buf := mb.cache.buf[li:] 5928 5929 // We use the high bit to denote we have already checked the checksum. 5930 var hh hash.Hash64 5931 if !hashChecked { 5932 hh = mb.hh // This will force the hash check in msgFromBuf. 5933 } 5934 5935 // Parse from the raw buffer. 5936 fsm, err := mb.msgFromBuf(buf, sm, hh) 5937 if err != nil || fsm == nil { 5938 return nil, err 5939 } 5940 5941 // Deleted messages that are decoded return a 0 for sequence. 5942 if fsm.seq == 0 { 5943 return nil, errDeletedMsg 5944 } 5945 5946 if seq != fsm.seq { 5947 recycleMsgBlockBuf(mb.cache.buf) 5948 mb.cache.buf = nil 5949 return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, fsm.seq) 5950 } 5951 5952 // Clear the check bit here after we know all is good. 5953 if !hashChecked { 5954 mb.cache.idx[seq-mb.cache.fseq] = (bi | hbit) 5955 } 5956 5957 return fsm, nil 5958 } 5959 5960 // Used when we are checking if discarding a message due to max msgs per subject will give us 5961 // enough room for a max bytes condition. 5962 // Lock should be already held. 5963 func (fs *fileStore) sizeForSeq(seq uint64) int { 5964 if seq == 0 { 5965 return 0 5966 } 5967 var smv StoreMsg 5968 if mb := fs.selectMsgBlock(seq); mb != nil { 5969 if sm, _, _ := mb.fetchMsg(seq, &smv); sm != nil { 5970 return int(fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)) 5971 } 5972 } 5973 return 0 5974 } 5975 5976 // Will return message for the given sequence number. 5977 func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) { 5978 // TODO(dlc) - Since Store, Remove, Skip all hold the write lock on fs this will 5979 // be stalled. Need another lock if want to happen in parallel. 5980 fs.mu.RLock() 5981 if fs.closed { 5982 fs.mu.RUnlock() 5983 return nil, ErrStoreClosed 5984 } 5985 // Indicates we want first msg. 5986 if seq == 0 { 5987 seq = fs.state.FirstSeq 5988 } 5989 // Make sure to snapshot here. 5990 mb, lseq := fs.selectMsgBlock(seq), fs.state.LastSeq 5991 fs.mu.RUnlock() 5992 5993 if mb == nil { 5994 var err = ErrStoreEOF 5995 if seq <= lseq { 5996 err = ErrStoreMsgNotFound 5997 } 5998 return nil, err 5999 } 6000 6001 fsm, expireOk, err := mb.fetchMsg(seq, sm) 6002 if err != nil { 6003 return nil, err 6004 } 6005 6006 // We detected a linear scan and access to the last message. 6007 // If we are not the last message block we can try to expire the cache. 6008 if expireOk { 6009 mb.tryForceExpireCache() 6010 } 6011 6012 return fsm, nil 6013 } 6014 6015 // Internal function to return msg parts from a raw buffer. 6016 // Lock should be held. 6017 func (mb *msgBlock) msgFromBuf(buf []byte, sm *StoreMsg, hh hash.Hash64) (*StoreMsg, error) { 6018 if len(buf) < emptyRecordLen { 6019 return nil, errBadMsg 6020 } 6021 var le = binary.LittleEndian 6022 6023 hdr := buf[:msgHdrSize] 6024 rl := le.Uint32(hdr[0:]) 6025 hasHeaders := rl&hbit != 0 6026 rl &^= hbit // clear header bit 6027 dlen := int(rl) - msgHdrSize 6028 slen := int(le.Uint16(hdr[20:])) 6029 // Simple sanity check. 6030 if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || int(rl) > len(buf) { 6031 return nil, errBadMsg 6032 } 6033 data := buf[msgHdrSize : msgHdrSize+dlen] 6034 // Do checksum tests here if requested. 6035 if hh != nil { 6036 hh.Reset() 6037 hh.Write(hdr[4:20]) 6038 hh.Write(data[:slen]) 6039 if hasHeaders { 6040 hh.Write(data[slen+4 : dlen-recordHashSize]) 6041 } else { 6042 hh.Write(data[slen : dlen-recordHashSize]) 6043 } 6044 if !bytes.Equal(hh.Sum(nil), data[len(data)-8:]) { 6045 return nil, errBadMsg 6046 } 6047 } 6048 seq := le.Uint64(hdr[4:]) 6049 if seq&ebit != 0 { 6050 seq = 0 6051 } 6052 ts := int64(le.Uint64(hdr[12:])) 6053 6054 // Create a StoreMsg if needed. 6055 if sm == nil { 6056 sm = new(StoreMsg) 6057 } else { 6058 sm.clear() 6059 } 6060 // To recycle the large blocks we can never pass back a reference, so need to copy for the upper 6061 // layers and for us to be safe to expire, and recycle, the large msgBlocks. 6062 end := dlen - 8 6063 6064 if hasHeaders { 6065 hl := le.Uint32(data[slen:]) 6066 bi := slen + 4 6067 li := bi + int(hl) 6068 sm.buf = append(sm.buf, data[bi:end]...) 6069 li, end = li-bi, end-bi 6070 sm.hdr = sm.buf[0:li:li] 6071 sm.msg = sm.buf[li:end] 6072 } else { 6073 sm.buf = append(sm.buf, data[slen:end]...) 6074 sm.msg = sm.buf[0 : end-slen] 6075 } 6076 sm.seq, sm.ts = seq, ts 6077 if slen > 0 { 6078 // Make a copy since sm.subj lifetime may last longer. 6079 sm.subj = string(data[:slen]) 6080 } 6081 6082 return sm, nil 6083 } 6084 6085 // LoadMsg will lookup the message by sequence number and return it if found. 6086 func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) { 6087 return fs.msgForSeq(seq, sm) 6088 } 6089 6090 // loadLast will load the last message for a subject. Subject should be non empty and not ">". 6091 func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err error) { 6092 fs.mu.RLock() 6093 defer fs.mu.RUnlock() 6094 6095 if fs.closed || fs.lmb == nil { 6096 return nil, ErrStoreClosed 6097 } 6098 6099 if len(fs.blks) == 0 { 6100 return nil, ErrStoreMsgNotFound 6101 } 6102 6103 start, stop := fs.lmb.index, fs.blks[0].index 6104 wc := subjectHasWildcard(subj) 6105 // If literal subject check for presence. 6106 if !wc { 6107 if info, ok := fs.psim.Find(stringToBytes(subj)); !ok { 6108 return nil, ErrStoreMsgNotFound 6109 } else { 6110 start, stop = info.lblk, info.fblk 6111 } 6112 } 6113 6114 // Walk blocks backwards. 6115 for i := start; i >= stop; i-- { 6116 mb := fs.bim[i] 6117 if mb == nil { 6118 continue 6119 } 6120 mb.mu.Lock() 6121 if err := mb.ensurePerSubjectInfoLoaded(); err != nil { 6122 mb.mu.Unlock() 6123 return nil, err 6124 } 6125 var l uint64 6126 // Optimize if subject is not a wildcard. 6127 if !wc { 6128 if ss := mb.fss[subj]; ss != nil { 6129 l = ss.Last 6130 } 6131 } 6132 if l == 0 { 6133 _, _, l = mb.filteredPendingLocked(subj, wc, atomic.LoadUint64(&mb.first.seq)) 6134 } 6135 if l > 0 { 6136 if mb.cacheNotLoaded() { 6137 if err := mb.loadMsgsWithLock(); err != nil { 6138 mb.mu.Unlock() 6139 return nil, err 6140 } 6141 } 6142 lsm, err = mb.cacheLookup(l, sm) 6143 } 6144 mb.mu.Unlock() 6145 if l > 0 { 6146 break 6147 } 6148 } 6149 return lsm, err 6150 } 6151 6152 // LoadLastMsg will return the last message we have that matches a given subject. 6153 // The subject can be a wildcard. 6154 func (fs *fileStore) LoadLastMsg(subject string, smv *StoreMsg) (sm *StoreMsg, err error) { 6155 if subject == _EMPTY_ || subject == fwcs { 6156 sm, err = fs.msgForSeq(fs.lastSeq(), smv) 6157 } else { 6158 sm, err = fs.loadLast(subject, smv) 6159 } 6160 if sm == nil || (err != nil && err != ErrStoreClosed) { 6161 err = ErrStoreMsgNotFound 6162 } 6163 return sm, err 6164 } 6165 6166 func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, uint64, error) { 6167 fs.mu.RLock() 6168 defer fs.mu.RUnlock() 6169 6170 if fs.closed { 6171 return nil, 0, ErrStoreClosed 6172 } 6173 if start < fs.state.FirstSeq { 6174 start = fs.state.FirstSeq 6175 } 6176 6177 // If start is less than or equal to beginning of our stream, meaning our first call, 6178 // let's check the psim to see if we can skip ahead. 6179 if start <= fs.state.FirstSeq { 6180 var ss SimpleState 6181 fs.numFilteredPending(filter, &ss) 6182 if ss.First > start { 6183 start = ss.First 6184 } 6185 } 6186 6187 if bi, _ := fs.selectMsgBlockWithIndex(start); bi >= 0 { 6188 for i := bi; i < len(fs.blks); i++ { 6189 mb := fs.blks[i] 6190 if sm, expireOk, err := mb.firstMatching(filter, wc, start, sm); err == nil { 6191 if expireOk { 6192 mb.tryForceExpireCache() 6193 } 6194 return sm, sm.seq, nil 6195 } else if err != ErrStoreMsgNotFound { 6196 return nil, 0, err 6197 } else if expireOk { 6198 mb.tryForceExpireCache() 6199 } 6200 } 6201 } 6202 6203 return nil, fs.state.LastSeq, ErrStoreEOF 6204 } 6205 6206 // Type returns the type of the underlying store. 6207 func (fs *fileStore) Type() StorageType { 6208 return FileStorage 6209 } 6210 6211 // Returns number of subjects in this store. 6212 // Lock should be held. 6213 func (fs *fileStore) numSubjects() int { 6214 return fs.psim.Size() 6215 } 6216 6217 // numConsumers uses new lock. 6218 func (fs *fileStore) numConsumers() int { 6219 fs.cmu.RLock() 6220 defer fs.cmu.RUnlock() 6221 return len(fs.cfs) 6222 } 6223 6224 // FastState will fill in state with only the following. 6225 // Msgs, Bytes, First and Last Sequence and Time and NumDeleted. 6226 func (fs *fileStore) FastState(state *StreamState) { 6227 fs.mu.RLock() 6228 state.Msgs = fs.state.Msgs 6229 state.Bytes = fs.state.Bytes 6230 state.FirstSeq = fs.state.FirstSeq 6231 state.FirstTime = fs.state.FirstTime 6232 state.LastSeq = fs.state.LastSeq 6233 state.LastTime = fs.state.LastTime 6234 if state.LastSeq > state.FirstSeq { 6235 state.NumDeleted = int((state.LastSeq - state.FirstSeq + 1) - state.Msgs) 6236 if state.NumDeleted < 0 { 6237 state.NumDeleted = 0 6238 } 6239 } 6240 state.Consumers = fs.numConsumers() 6241 state.NumSubjects = fs.numSubjects() 6242 fs.mu.RUnlock() 6243 } 6244 6245 // State returns the current state of the stream. 6246 func (fs *fileStore) State() StreamState { 6247 fs.mu.RLock() 6248 state := fs.state 6249 state.Consumers = fs.numConsumers() 6250 state.NumSubjects = fs.numSubjects() 6251 state.Deleted = nil // make sure. 6252 6253 if numDeleted := int((state.LastSeq - state.FirstSeq + 1) - state.Msgs); numDeleted > 0 { 6254 state.Deleted = make([]uint64, 0, numDeleted) 6255 cur := fs.state.FirstSeq 6256 6257 for _, mb := range fs.blks { 6258 mb.mu.Lock() 6259 fseq := atomic.LoadUint64(&mb.first.seq) 6260 // Account for messages missing from the head. 6261 if fseq > cur { 6262 for seq := cur; seq < fseq; seq++ { 6263 state.Deleted = append(state.Deleted, seq) 6264 } 6265 } 6266 cur = atomic.LoadUint64(&mb.last.seq) + 1 // Expected next first. 6267 6268 mb.dmap.Range(func(seq uint64) bool { 6269 if seq < fseq { 6270 mb.dmap.Delete(seq) 6271 } else { 6272 state.Deleted = append(state.Deleted, seq) 6273 } 6274 return true 6275 }) 6276 mb.mu.Unlock() 6277 } 6278 } 6279 fs.mu.RUnlock() 6280 6281 state.Lost = fs.lostData() 6282 6283 // Can not be guaranteed to be sorted. 6284 if len(state.Deleted) > 0 { 6285 sort.Slice(state.Deleted, func(i, j int) bool { 6286 return state.Deleted[i] < state.Deleted[j] 6287 }) 6288 state.NumDeleted = len(state.Deleted) 6289 } 6290 return state 6291 } 6292 6293 func (fs *fileStore) Utilization() (total, reported uint64, err error) { 6294 fs.mu.RLock() 6295 defer fs.mu.RUnlock() 6296 for _, mb := range fs.blks { 6297 mb.mu.RLock() 6298 reported += mb.bytes 6299 total += mb.rbytes 6300 mb.mu.RUnlock() 6301 } 6302 return total, reported, nil 6303 } 6304 6305 func fileStoreMsgSize(subj string, hdr, msg []byte) uint64 { 6306 if len(hdr) == 0 { 6307 // length of the message record (4bytes) + seq(8) + ts(8) + subj_len(2) + subj + msg + hash(8) 6308 return uint64(22 + len(subj) + len(msg) + 8) 6309 } 6310 // length of the message record (4bytes) + seq(8) + ts(8) + subj_len(2) + subj + hdr_len(4) + hdr + msg + hash(8) 6311 return uint64(22 + len(subj) + 4 + len(hdr) + len(msg) + 8) 6312 } 6313 6314 func fileStoreMsgSizeEstimate(slen, maxPayload int) uint64 { 6315 return uint64(emptyRecordLen + slen + 4 + maxPayload) 6316 } 6317 6318 // Determine time since any last activity, read/load, write or remove. 6319 func (mb *msgBlock) sinceLastActivity() time.Duration { 6320 if mb.closed { 6321 return 0 6322 } 6323 last := mb.lwts 6324 if mb.lrts > last { 6325 last = mb.lrts 6326 } 6327 if mb.llts > last { 6328 last = mb.llts 6329 } 6330 return time.Since(time.Unix(0, last).UTC()) 6331 } 6332 6333 // Determine time since last write or remove of a message. 6334 // Read lock should be held. 6335 func (mb *msgBlock) sinceLastWriteActivity() time.Duration { 6336 if mb.closed { 6337 return 0 6338 } 6339 last := mb.lwts 6340 if mb.lrts > last { 6341 last = mb.lrts 6342 } 6343 return time.Since(time.Unix(0, last).UTC()) 6344 } 6345 6346 func checkNewHeader(hdr []byte) error { 6347 if hdr == nil || len(hdr) < 2 || hdr[0] != magic || 6348 (hdr[1] != version && hdr[1] != newVersion) { 6349 return errCorruptState 6350 } 6351 return nil 6352 } 6353 6354 // readIndexInfo will read in the index information for the message block. 6355 func (mb *msgBlock) readIndexInfo() error { 6356 ifn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, mb.index)) 6357 buf, err := os.ReadFile(ifn) 6358 if err != nil { 6359 return err 6360 } 6361 6362 // Set if first time. 6363 if mb.liwsz == 0 { 6364 mb.liwsz = int64(len(buf)) 6365 } 6366 6367 // Decrypt if needed. 6368 if mb.aek != nil { 6369 buf, err = mb.aek.Open(buf[:0], mb.nonce, buf, nil) 6370 if err != nil { 6371 return err 6372 } 6373 } 6374 6375 if err := checkNewHeader(buf); err != nil { 6376 defer os.Remove(ifn) 6377 return fmt.Errorf("bad index file") 6378 } 6379 6380 bi := hdrLen 6381 6382 // Helpers, will set i to -1 on error. 6383 readSeq := func() uint64 { 6384 if bi < 0 { 6385 return 0 6386 } 6387 seq, n := binary.Uvarint(buf[bi:]) 6388 if n <= 0 { 6389 bi = -1 6390 return 0 6391 } 6392 bi += n 6393 return seq &^ ebit 6394 } 6395 readCount := readSeq 6396 readTimeStamp := func() int64 { 6397 if bi < 0 { 6398 return 0 6399 } 6400 ts, n := binary.Varint(buf[bi:]) 6401 if n <= 0 { 6402 bi = -1 6403 return -1 6404 } 6405 bi += n 6406 return ts 6407 } 6408 mb.msgs = readCount() 6409 mb.bytes = readCount() 6410 atomic.StoreUint64(&mb.first.seq, readSeq()) 6411 mb.first.ts = readTimeStamp() 6412 atomic.StoreUint64(&mb.last.seq, readSeq()) 6413 mb.last.ts = readTimeStamp() 6414 dmapLen := readCount() 6415 6416 // Check if this is a short write index file. 6417 if bi < 0 || bi+checksumSize > len(buf) { 6418 os.Remove(ifn) 6419 return fmt.Errorf("short index file") 6420 } 6421 6422 // Check for consistency if accounting. If something is off bail and we will rebuild. 6423 if mb.msgs != (atomic.LoadUint64(&mb.last.seq)-atomic.LoadUint64(&mb.first.seq)+1)-dmapLen { 6424 os.Remove(ifn) 6425 return fmt.Errorf("accounting inconsistent") 6426 } 6427 6428 // Checksum 6429 copy(mb.lchk[0:], buf[bi:bi+checksumSize]) 6430 bi += checksumSize 6431 6432 // Now check for presence of a delete map 6433 if dmapLen > 0 { 6434 // New version is encoded avl seqset. 6435 if buf[1] == newVersion { 6436 dmap, _, err := avl.Decode(buf[bi:]) 6437 if err != nil { 6438 return fmt.Errorf("could not decode avl dmap: %v", err) 6439 } 6440 mb.dmap = *dmap 6441 } else { 6442 // This is the old version. 6443 for i, fseq := 0, atomic.LoadUint64(&mb.first.seq); i < int(dmapLen); i++ { 6444 seq := readSeq() 6445 if seq == 0 { 6446 break 6447 } 6448 mb.dmap.Insert(seq + fseq) 6449 } 6450 } 6451 } 6452 6453 return nil 6454 } 6455 6456 // Will return total number of cache loads. 6457 func (fs *fileStore) cacheLoads() uint64 { 6458 var tl uint64 6459 fs.mu.RLock() 6460 for _, mb := range fs.blks { 6461 tl += mb.cloads 6462 } 6463 fs.mu.RUnlock() 6464 return tl 6465 } 6466 6467 // Will return total number of cached bytes. 6468 func (fs *fileStore) cacheSize() uint64 { 6469 var sz uint64 6470 fs.mu.RLock() 6471 for _, mb := range fs.blks { 6472 mb.mu.RLock() 6473 if mb.cache != nil { 6474 sz += uint64(len(mb.cache.buf)) 6475 } 6476 mb.mu.RUnlock() 6477 } 6478 fs.mu.RUnlock() 6479 return sz 6480 } 6481 6482 // Will return total number of dmapEntries for all msg blocks. 6483 func (fs *fileStore) dmapEntries() int { 6484 var total int 6485 fs.mu.RLock() 6486 for _, mb := range fs.blks { 6487 total += mb.dmap.Size() 6488 } 6489 fs.mu.RUnlock() 6490 return total 6491 } 6492 6493 // Fixed helper for iterating. 6494 func subjectsEqual(a, b string) bool { 6495 return a == b 6496 } 6497 6498 func subjectsAll(a, b string) bool { 6499 return true 6500 } 6501 6502 func compareFn(subject string) func(string, string) bool { 6503 if subject == _EMPTY_ || subject == fwcs { 6504 return subjectsAll 6505 } 6506 if subjectHasWildcard(subject) { 6507 return subjectIsSubsetMatch 6508 } 6509 return subjectsEqual 6510 } 6511 6512 // PurgeEx will remove messages based on subject filters, sequence and number of messages to keep. 6513 // Will return the number of purged messages. 6514 func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint64, err error) { 6515 if subject == _EMPTY_ || subject == fwcs { 6516 if keep == 0 && sequence == 0 { 6517 return fs.Purge() 6518 } 6519 if sequence > 1 { 6520 return fs.Compact(sequence) 6521 } 6522 } 6523 6524 eq, wc := compareFn(subject), subjectHasWildcard(subject) 6525 var firstSeqNeedsUpdate bool 6526 var bytes uint64 6527 6528 // If we have a "keep" designation need to get full filtered state so we know how many to purge. 6529 var maxp uint64 6530 if keep > 0 { 6531 ss := fs.FilteredState(1, subject) 6532 if keep >= ss.Msgs { 6533 return 0, nil 6534 } 6535 maxp = ss.Msgs - keep 6536 } 6537 6538 var smv StoreMsg 6539 6540 fs.mu.Lock() 6541 // We may remove blocks as we purge, so don't range directly on fs.blks 6542 // otherwise we may jump over some (see https://github.com/nats-io/nats-server/issues/3528) 6543 for i := 0; i < len(fs.blks); i++ { 6544 mb := fs.blks[i] 6545 mb.mu.Lock() 6546 6547 // If we do not have our fss, try to expire the cache if we have no items in this block. 6548 shouldExpire := mb.fssNotLoaded() 6549 6550 t, f, l := mb.filteredPendingLocked(subject, wc, atomic.LoadUint64(&mb.first.seq)) 6551 if t == 0 { 6552 // Expire if we were responsible for loading. 6553 if shouldExpire { 6554 // Expire this cache before moving on. 6555 mb.tryForceExpireCacheLocked() 6556 } 6557 mb.mu.Unlock() 6558 continue 6559 } 6560 6561 if sequence > 1 && sequence <= l { 6562 l = sequence - 1 6563 } 6564 6565 if mb.cacheNotLoaded() { 6566 mb.loadMsgsWithLock() 6567 shouldExpire = true 6568 } 6569 6570 for seq := f; seq <= l; seq++ { 6571 if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && eq(sm.subj, subject) { 6572 rl := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) 6573 // Do fast in place remove. 6574 // Stats 6575 if mb.msgs > 0 { 6576 // Msgs 6577 fs.state.Msgs-- 6578 mb.msgs-- 6579 // Bytes, make sure to not go negative. 6580 if rl > fs.state.Bytes { 6581 rl = fs.state.Bytes 6582 } 6583 if rl > mb.bytes { 6584 rl = mb.bytes 6585 } 6586 fs.state.Bytes -= rl 6587 mb.bytes -= rl 6588 // Totals 6589 purged++ 6590 bytes += rl 6591 } 6592 // FSS updates. 6593 mb.removeSeqPerSubject(sm.subj, seq) 6594 fs.removePerSubject(sm.subj) 6595 6596 // Check for first message. 6597 if seq == atomic.LoadUint64(&mb.first.seq) { 6598 mb.selectNextFirst() 6599 if mb.isEmpty() { 6600 fs.removeMsgBlock(mb) 6601 i-- 6602 // keep flag set, if set previously 6603 firstSeqNeedsUpdate = firstSeqNeedsUpdate || seq == fs.state.FirstSeq 6604 } else if seq == fs.state.FirstSeq { 6605 fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) // new one. 6606 fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() 6607 } 6608 } else { 6609 // Out of order delete. 6610 mb.dmap.Insert(seq) 6611 } 6612 6613 if maxp > 0 && purged >= maxp { 6614 break 6615 } 6616 } 6617 } 6618 // Expire if we were responsible for loading. 6619 if shouldExpire { 6620 // Expire this cache before moving on. 6621 mb.tryForceExpireCacheLocked() 6622 } 6623 mb.mu.Unlock() 6624 6625 // Check if we should break out of top level too. 6626 if maxp > 0 && purged >= maxp { 6627 break 6628 } 6629 } 6630 if firstSeqNeedsUpdate { 6631 fs.selectNextFirst() 6632 } 6633 6634 fs.dirty++ 6635 cb := fs.scb 6636 fs.mu.Unlock() 6637 6638 fs.kickFlushStateLoop() 6639 6640 if cb != nil { 6641 cb(-int64(purged), -int64(bytes), 0, _EMPTY_) 6642 } 6643 6644 return purged, nil 6645 } 6646 6647 // Purge will remove all messages from this store. 6648 // Will return the number of purged messages. 6649 func (fs *fileStore) Purge() (uint64, error) { 6650 return fs.purge(0) 6651 } 6652 6653 func (fs *fileStore) purge(fseq uint64) (uint64, error) { 6654 fs.mu.Lock() 6655 if fs.closed { 6656 fs.mu.Unlock() 6657 return 0, ErrStoreClosed 6658 } 6659 6660 purged := fs.state.Msgs 6661 rbytes := int64(fs.state.Bytes) 6662 6663 fs.state.FirstSeq = fs.state.LastSeq + 1 6664 fs.state.FirstTime = time.Time{} 6665 6666 fs.state.Bytes = 0 6667 fs.state.Msgs = 0 6668 6669 for _, mb := range fs.blks { 6670 mb.dirtyClose() 6671 } 6672 6673 fs.blks = nil 6674 fs.lmb = nil 6675 fs.bim = make(map[uint32]*msgBlock) 6676 // Clear any per subject tracking. 6677 fs.psim, fs.tsl = fs.psim.Empty(), 0 6678 // Mark dirty 6679 fs.dirty++ 6680 6681 // Move the msgs directory out of the way, will delete out of band. 6682 // FIXME(dlc) - These can error and we need to change api above to propagate? 6683 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 6684 pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) 6685 // If purge directory still exists then we need to wait 6686 // in place and remove since rename would fail. 6687 if _, err := os.Stat(pdir); err == nil { 6688 <-dios 6689 os.RemoveAll(pdir) 6690 dios <- struct{}{} 6691 } 6692 6693 <-dios 6694 os.Rename(mdir, pdir) 6695 dios <- struct{}{} 6696 6697 go func() { 6698 <-dios 6699 os.RemoveAll(pdir) 6700 dios <- struct{}{} 6701 }() 6702 6703 // Create new one. 6704 <-dios 6705 os.MkdirAll(mdir, defaultDirPerms) 6706 dios <- struct{}{} 6707 6708 // Make sure we have a lmb to write to. 6709 if _, err := fs.newMsgBlockForWrite(); err != nil { 6710 fs.mu.Unlock() 6711 return purged, err 6712 } 6713 6714 // Check if we need to set the first seq to a new number. 6715 if fseq > fs.state.FirstSeq { 6716 fs.state.FirstSeq = fseq 6717 fs.state.LastSeq = fseq - 1 6718 } 6719 6720 lmb := fs.lmb 6721 atomic.StoreUint64(&lmb.first.seq, fs.state.FirstSeq) 6722 atomic.StoreUint64(&lmb.last.seq, fs.state.LastSeq) 6723 lmb.last.ts = fs.state.LastTime.UnixNano() 6724 6725 if lseq := atomic.LoadUint64(&lmb.last.seq); lseq > 1 { 6726 // Leave a tombstone so we can remember our starting sequence in case 6727 // full state becomes corrupted. 6728 lmb.writeTombstone(lseq, lmb.last.ts) 6729 } 6730 6731 cb := fs.scb 6732 fs.mu.Unlock() 6733 6734 if cb != nil { 6735 cb(-int64(purged), -rbytes, 0, _EMPTY_) 6736 } 6737 6738 return purged, nil 6739 } 6740 6741 // Compact will remove all messages from this store up to 6742 // but not including the seq parameter. 6743 // Will return the number of purged messages. 6744 func (fs *fileStore) Compact(seq uint64) (uint64, error) { 6745 if seq == 0 { 6746 return fs.purge(seq) 6747 } 6748 6749 var purged, bytes uint64 6750 6751 fs.mu.Lock() 6752 // Same as purge all. 6753 if lseq := fs.state.LastSeq; seq > lseq { 6754 fs.mu.Unlock() 6755 return fs.purge(seq) 6756 } 6757 // We have to delete interior messages. 6758 smb := fs.selectMsgBlock(seq) 6759 if smb == nil { 6760 fs.mu.Unlock() 6761 return 0, nil 6762 } 6763 6764 // All msgblocks up to this one can be thrown away. 6765 var deleted int 6766 for _, mb := range fs.blks { 6767 if mb == smb { 6768 break 6769 } 6770 mb.mu.Lock() 6771 purged += mb.msgs 6772 bytes += mb.bytes 6773 // Make sure we do subject cleanup as well. 6774 mb.ensurePerSubjectInfoLoaded() 6775 for subj, ss := range mb.fss { 6776 for i := uint64(0); i < ss.Msgs; i++ { 6777 fs.removePerSubject(subj) 6778 } 6779 } 6780 // Now close. 6781 mb.dirtyCloseWithRemove(true) 6782 mb.mu.Unlock() 6783 deleted++ 6784 } 6785 6786 var smv StoreMsg 6787 var err error 6788 var isEmpty bool 6789 6790 smb.mu.Lock() 6791 if atomic.LoadUint64(&smb.first.seq) == seq { 6792 fs.state.FirstSeq = atomic.LoadUint64(&smb.first.seq) 6793 fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC() 6794 goto SKIP 6795 } 6796 6797 // Make sure we have the messages loaded. 6798 if smb.cacheNotLoaded() { 6799 if err = smb.loadMsgsWithLock(); err != nil { 6800 goto SKIP 6801 } 6802 } 6803 for mseq := atomic.LoadUint64(&smb.first.seq); mseq < seq; mseq++ { 6804 sm, err := smb.cacheLookup(mseq, &smv) 6805 if err == errDeletedMsg { 6806 // Update dmap. 6807 if !smb.dmap.IsEmpty() { 6808 smb.dmap.Delete(seq) 6809 } 6810 } else if sm != nil { 6811 sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) 6812 if smb.msgs > 0 { 6813 smb.msgs-- 6814 if sz > smb.bytes { 6815 sz = smb.bytes 6816 } 6817 smb.bytes -= sz 6818 bytes += sz 6819 purged++ 6820 } 6821 // Update fss 6822 smb.removeSeqPerSubject(sm.subj, mseq) 6823 fs.removePerSubject(sm.subj) 6824 } 6825 } 6826 6827 // Check if empty after processing, could happen if tail of messages are all deleted. 6828 isEmpty = smb.msgs == 0 6829 if isEmpty { 6830 smb.dirtyCloseWithRemove(true) 6831 // Update fs first here as well. 6832 fs.state.FirstSeq = atomic.LoadUint64(&smb.last.seq) + 1 6833 fs.state.FirstTime = time.Time{} 6834 deleted++ 6835 } else { 6836 // Make sure to sync changes. 6837 smb.needSync = true 6838 // Update fs first seq and time. 6839 atomic.StoreUint64(&smb.first.seq, seq-1) // Just for start condition for selectNextFirst. 6840 smb.selectNextFirst() 6841 6842 fs.state.FirstSeq = atomic.LoadUint64(&smb.first.seq) 6843 fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC() 6844 6845 // Check if we should reclaim the head space from this block. 6846 // This will be optimistic only, so don't continue if we encounter any errors here. 6847 if smb.rbytes > compactMinimum && smb.bytes*2 < smb.rbytes { 6848 var moff uint32 6849 moff, _, _, err = smb.slotInfo(int(atomic.LoadUint64(&smb.first.seq) - smb.cache.fseq)) 6850 if err != nil || moff >= uint32(len(smb.cache.buf)) { 6851 goto SKIP 6852 } 6853 buf := smb.cache.buf[moff:] 6854 // Don't reuse, copy to new recycled buf. 6855 nbuf := getMsgBlockBuf(len(buf)) 6856 nbuf = append(nbuf, buf...) 6857 smb.closeFDsLockedNoCheck() 6858 // Check for encryption. 6859 if smb.bek != nil && len(nbuf) > 0 { 6860 // Recreate to reset counter. 6861 bek, err := genBlockEncryptionKey(smb.fs.fcfg.Cipher, smb.seed, smb.nonce) 6862 if err != nil { 6863 goto SKIP 6864 } 6865 // For future writes make sure to set smb.bek to keep counter correct. 6866 smb.bek = bek 6867 smb.bek.XORKeyStream(nbuf, nbuf) 6868 } 6869 // Recompress if necessary (smb.cmp contains the algorithm used when 6870 // the block was loaded from disk, or defaults to NoCompression if not) 6871 if nbuf, err = smb.cmp.Compress(nbuf); err != nil { 6872 goto SKIP 6873 } 6874 <-dios 6875 err = os.WriteFile(smb.mfn, nbuf, defaultFilePerms) 6876 dios <- struct{}{} 6877 if err != nil { 6878 goto SKIP 6879 } 6880 // Make sure to remove fss state. 6881 smb.fss = nil 6882 smb.clearCacheAndOffset() 6883 smb.rbytes = uint64(len(nbuf)) 6884 } 6885 } 6886 6887 SKIP: 6888 smb.mu.Unlock() 6889 6890 if deleted > 0 { 6891 // Update block map. 6892 if fs.bim != nil { 6893 for _, mb := range fs.blks[:deleted] { 6894 delete(fs.bim, mb.index) 6895 } 6896 } 6897 // Update blks slice. 6898 fs.blks = copyMsgBlocks(fs.blks[deleted:]) 6899 if lb := len(fs.blks); lb == 0 { 6900 fs.lmb = nil 6901 } else { 6902 fs.lmb = fs.blks[lb-1] 6903 } 6904 } 6905 6906 // Update top level accounting. 6907 if purged > fs.state.Msgs { 6908 purged = fs.state.Msgs 6909 } 6910 fs.state.Msgs -= purged 6911 6912 if bytes > fs.state.Bytes { 6913 bytes = fs.state.Bytes 6914 } 6915 fs.state.Bytes -= bytes 6916 6917 fs.dirty++ 6918 fs.kickFlushStateLoop() 6919 6920 cb := fs.scb 6921 fs.mu.Unlock() 6922 6923 if cb != nil && purged > 0 { 6924 cb(-int64(purged), -int64(bytes), 0, _EMPTY_) 6925 } 6926 6927 return purged, err 6928 } 6929 6930 // Will completely reset our store. 6931 func (fs *fileStore) reset() error { 6932 fs.mu.Lock() 6933 if fs.closed { 6934 fs.mu.Unlock() 6935 return ErrStoreClosed 6936 } 6937 if fs.sips > 0 { 6938 fs.mu.Unlock() 6939 return ErrStoreSnapshotInProgress 6940 } 6941 6942 var purged, bytes uint64 6943 cb := fs.scb 6944 6945 for _, mb := range fs.blks { 6946 mb.mu.Lock() 6947 purged += mb.msgs 6948 bytes += mb.bytes 6949 mb.dirtyCloseWithRemove(true) 6950 mb.mu.Unlock() 6951 } 6952 6953 // Reset 6954 fs.state.FirstSeq = 0 6955 fs.state.FirstTime = time.Time{} 6956 fs.state.LastSeq = 0 6957 fs.state.LastTime = time.Now().UTC() 6958 // Update msgs and bytes. 6959 fs.state.Msgs = 0 6960 fs.state.Bytes = 0 6961 6962 // Reset blocks. 6963 fs.blks, fs.lmb = nil, nil 6964 6965 // Reset subject mappings. 6966 fs.psim, fs.tsl = fs.psim.Empty(), 0 6967 fs.bim = make(map[uint32]*msgBlock) 6968 6969 // If we purged anything, make sure we kick flush state loop. 6970 if purged > 0 { 6971 fs.dirty++ 6972 fs.kickFlushStateLoop() 6973 } 6974 6975 fs.mu.Unlock() 6976 6977 if cb != nil { 6978 cb(-int64(purged), -int64(bytes), 0, _EMPTY_) 6979 } 6980 6981 return nil 6982 } 6983 6984 // Truncate will truncate a stream store up to seq. Sequence needs to be valid. 6985 func (fs *fileStore) Truncate(seq uint64) error { 6986 // Check for request to reset. 6987 if seq == 0 { 6988 return fs.reset() 6989 } 6990 6991 fs.mu.Lock() 6992 6993 if fs.closed { 6994 fs.mu.Unlock() 6995 return ErrStoreClosed 6996 } 6997 if fs.sips > 0 { 6998 fs.mu.Unlock() 6999 return ErrStoreSnapshotInProgress 7000 } 7001 7002 nlmb := fs.selectMsgBlock(seq) 7003 if nlmb == nil { 7004 fs.mu.Unlock() 7005 return ErrInvalidSequence 7006 } 7007 lsm, _, _ := nlmb.fetchMsg(seq, nil) 7008 if lsm == nil { 7009 fs.mu.Unlock() 7010 return ErrInvalidSequence 7011 } 7012 7013 // Set lmb to nlmb and make sure writeable. 7014 fs.lmb = nlmb 7015 if err := nlmb.enableForWriting(fs.fip); err != nil { 7016 return err 7017 } 7018 7019 var purged, bytes uint64 7020 7021 // Truncate our new last message block. 7022 nmsgs, nbytes, err := nlmb.truncate(lsm) 7023 if err != nil { 7024 fs.mu.Unlock() 7025 return fmt.Errorf("nlmb.truncate: %w", err) 7026 } 7027 // Account for the truncated msgs and bytes. 7028 purged += nmsgs 7029 bytes += nbytes 7030 7031 // Remove any left over msg blocks. 7032 getLastMsgBlock := func() *msgBlock { return fs.blks[len(fs.blks)-1] } 7033 for mb := getLastMsgBlock(); mb != nlmb; mb = getLastMsgBlock() { 7034 mb.mu.Lock() 7035 purged += mb.msgs 7036 bytes += mb.bytes 7037 fs.removeMsgBlock(mb) 7038 mb.mu.Unlock() 7039 } 7040 7041 // Reset last. 7042 fs.state.LastSeq = lsm.seq 7043 fs.state.LastTime = time.Unix(0, lsm.ts).UTC() 7044 // Update msgs and bytes. 7045 if purged > fs.state.Msgs { 7046 purged = fs.state.Msgs 7047 } 7048 fs.state.Msgs -= purged 7049 if bytes > fs.state.Bytes { 7050 bytes = fs.state.Bytes 7051 } 7052 fs.state.Bytes -= bytes 7053 7054 // Reset our subject lookup info. 7055 fs.resetGlobalPerSubjectInfo() 7056 7057 fs.dirty++ 7058 fs.kickFlushStateLoop() 7059 7060 cb := fs.scb 7061 fs.mu.Unlock() 7062 7063 if cb != nil { 7064 cb(-int64(purged), -int64(bytes), 0, _EMPTY_) 7065 } 7066 7067 return nil 7068 } 7069 7070 func (fs *fileStore) lastSeq() uint64 { 7071 fs.mu.RLock() 7072 seq := fs.state.LastSeq 7073 fs.mu.RUnlock() 7074 return seq 7075 } 7076 7077 // Returns number of msg blks. 7078 func (fs *fileStore) numMsgBlocks() int { 7079 fs.mu.RLock() 7080 defer fs.mu.RUnlock() 7081 return len(fs.blks) 7082 } 7083 7084 // Will add a new msgBlock. 7085 // Lock should be held. 7086 func (fs *fileStore) addMsgBlock(mb *msgBlock) { 7087 fs.blks = append(fs.blks, mb) 7088 fs.lmb = mb 7089 fs.bim[mb.index] = mb 7090 } 7091 7092 // Remove from our list of blks. 7093 // Both locks should be held. 7094 func (fs *fileStore) removeMsgBlockFromList(mb *msgBlock) { 7095 // Remove from list. 7096 for i, omb := range fs.blks { 7097 if mb == omb { 7098 fs.dirty++ 7099 blks := append(fs.blks[:i], fs.blks[i+1:]...) 7100 fs.blks = copyMsgBlocks(blks) 7101 if fs.bim != nil { 7102 delete(fs.bim, mb.index) 7103 } 7104 break 7105 } 7106 } 7107 } 7108 7109 // Removes the msgBlock 7110 // Both locks should be held. 7111 func (fs *fileStore) removeMsgBlock(mb *msgBlock) { 7112 mb.dirtyCloseWithRemove(true) 7113 fs.removeMsgBlockFromList(mb) 7114 // Check for us being last message block 7115 if mb == fs.lmb { 7116 lseq, lts := atomic.LoadUint64(&mb.last.seq), mb.last.ts 7117 // Creating a new message write block requires that the lmb lock is not held. 7118 mb.mu.Unlock() 7119 // Write the tombstone to remember since this was last block. 7120 if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { 7121 lmb.writeTombstone(lseq, lts) 7122 } 7123 mb.mu.Lock() 7124 } 7125 } 7126 7127 // Called by purge to simply get rid of the cache and close our fds. 7128 // Lock should not be held. 7129 func (mb *msgBlock) dirtyClose() { 7130 mb.mu.Lock() 7131 defer mb.mu.Unlock() 7132 mb.dirtyCloseWithRemove(false) 7133 } 7134 7135 // Should be called with lock held. 7136 func (mb *msgBlock) dirtyCloseWithRemove(remove bool) { 7137 if mb == nil { 7138 return 7139 } 7140 // Stop cache expiration timer. 7141 if mb.ctmr != nil { 7142 mb.ctmr.Stop() 7143 mb.ctmr = nil 7144 } 7145 // Clear any tracking by subject. 7146 mb.fss = nil 7147 // Close cache 7148 mb.clearCacheAndOffset() 7149 // Quit our loops. 7150 if mb.qch != nil { 7151 close(mb.qch) 7152 mb.qch = nil 7153 } 7154 if mb.mfd != nil { 7155 mb.mfd.Close() 7156 mb.mfd = nil 7157 } 7158 if remove { 7159 if mb.mfn != _EMPTY_ { 7160 os.Remove(mb.mfn) 7161 mb.mfn = _EMPTY_ 7162 } 7163 if mb.kfn != _EMPTY_ { 7164 os.Remove(mb.kfn) 7165 } 7166 // Since we are removing a block kick the state flusher. 7167 mb.fs.kickFlushStateLoop() 7168 } 7169 } 7170 7171 // Remove a seq from the fss and select new first. 7172 // Lock should be held. 7173 func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) { 7174 mb.ensurePerSubjectInfoLoaded() 7175 ss := mb.fss[subj] 7176 if ss == nil { 7177 return 7178 } 7179 7180 if ss.Msgs == 1 { 7181 delete(mb.fss, subj) 7182 return 7183 } 7184 7185 ss.Msgs-- 7186 7187 // Only one left. 7188 if ss.Msgs == 1 { 7189 if seq == ss.Last { 7190 ss.Last = ss.First 7191 } else { 7192 ss.First = ss.Last 7193 } 7194 ss.firstNeedsUpdate = false 7195 return 7196 } 7197 7198 // We can lazily calculate the first sequence when needed. 7199 ss.firstNeedsUpdate = seq == ss.First || ss.firstNeedsUpdate 7200 } 7201 7202 // Will recalulate the first sequence for this subject in this block. 7203 // Will avoid slower path message lookups and scan the cache directly instead. 7204 func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *SimpleState) { 7205 // Need to make sure messages are loaded. 7206 if mb.cacheNotLoaded() { 7207 if err := mb.loadMsgsWithLock(); err != nil { 7208 return 7209 } 7210 } 7211 7212 // Mark first as updated. 7213 ss.firstNeedsUpdate = false 7214 startSeq++ 7215 7216 startSlot := int(startSeq - mb.cache.fseq) 7217 if startSlot >= len(mb.cache.idx) { 7218 ss.First = ss.Last 7219 return 7220 } else if startSlot < 0 { 7221 startSlot = 0 7222 } 7223 7224 var le = binary.LittleEndian 7225 for slot, fseq := startSlot, atomic.LoadUint64(&mb.first.seq); slot < len(mb.cache.idx); slot++ { 7226 bi := mb.cache.idx[slot] &^ hbit 7227 if bi == dbit { 7228 // delete marker so skip. 7229 continue 7230 } 7231 li := int(bi) - mb.cache.off 7232 if li >= len(mb.cache.buf) { 7233 ss.First = ss.Last 7234 return 7235 } 7236 buf := mb.cache.buf[li:] 7237 hdr := buf[:msgHdrSize] 7238 slen := int(le.Uint16(hdr[20:])) 7239 if subj == bytesToString(buf[msgHdrSize:msgHdrSize+slen]) { 7240 seq := le.Uint64(hdr[4:]) 7241 if seq < fseq || seq&ebit != 0 || mb.dmap.Exists(seq) { 7242 continue 7243 } 7244 ss.First = seq 7245 return 7246 } 7247 } 7248 } 7249 7250 // Lock should be held. 7251 func (fs *fileStore) resetGlobalPerSubjectInfo() { 7252 // Clear any global subject state. 7253 fs.psim, fs.tsl = fs.psim.Empty(), 0 7254 for _, mb := range fs.blks { 7255 fs.populateGlobalPerSubjectInfo(mb) 7256 } 7257 } 7258 7259 // Lock should be held. 7260 func (mb *msgBlock) resetPerSubjectInfo() error { 7261 mb.fss = nil 7262 return mb.generatePerSubjectInfo() 7263 } 7264 7265 // generatePerSubjectInfo will generate the per subject info via the raw msg block. 7266 // Lock should be held. 7267 func (mb *msgBlock) generatePerSubjectInfo() error { 7268 // Check if this mb is empty. This can happen when its the last one and we are holding onto it for seq and timestamp info. 7269 if mb.msgs == 0 { 7270 return nil 7271 } 7272 7273 if mb.cacheNotLoaded() { 7274 if err := mb.loadMsgsWithLock(); err != nil { 7275 return err 7276 } 7277 // indexCacheBuf can produce fss now, so if non-nil we are good. 7278 if mb.fss != nil { 7279 return nil 7280 } 7281 } 7282 7283 // Create new one regardless. 7284 mb.fss = make(map[string]*SimpleState) 7285 7286 var smv StoreMsg 7287 fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 7288 for seq := fseq; seq <= lseq; seq++ { 7289 sm, err := mb.cacheLookup(seq, &smv) 7290 if err != nil { 7291 // Since we are walking by sequence we can ignore some errors that are benign to rebuilding our state. 7292 if err == ErrStoreMsgNotFound || err == errDeletedMsg { 7293 continue 7294 } 7295 if err == errNoCache { 7296 return nil 7297 } 7298 return err 7299 } 7300 if sm != nil && len(sm.subj) > 0 { 7301 if ss := mb.fss[sm.subj]; ss != nil { 7302 ss.Msgs++ 7303 ss.Last = seq 7304 } else { 7305 mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} 7306 } 7307 } 7308 } 7309 7310 if len(mb.fss) > 0 { 7311 // Make sure we run the cache expire timer. 7312 mb.llts = time.Now().UnixNano() 7313 mb.startCacheExpireTimer() 7314 } 7315 return nil 7316 } 7317 7318 // Helper to make sure fss loaded if we are tracking. 7319 // Lock should be held 7320 func (mb *msgBlock) ensurePerSubjectInfoLoaded() error { 7321 if mb.fss != nil || mb.noTrack { 7322 return nil 7323 } 7324 if mb.msgs == 0 { 7325 mb.fss = make(map[string]*SimpleState) 7326 return nil 7327 } 7328 return mb.generatePerSubjectInfo() 7329 } 7330 7331 // Called on recovery to populate the global psim state. 7332 // Lock should be held. 7333 func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) { 7334 mb.mu.Lock() 7335 defer mb.mu.Unlock() 7336 7337 if err := mb.ensurePerSubjectInfoLoaded(); err != nil { 7338 return 7339 } 7340 7341 // Now populate psim. 7342 for subj, ss := range mb.fss { 7343 if len(subj) > 0 { 7344 bsubj := stringToBytes(subj) 7345 if info, ok := fs.psim.Find(bsubj); ok { 7346 info.total += ss.Msgs 7347 if mb.index > info.lblk { 7348 info.lblk = mb.index 7349 } 7350 } else { 7351 fs.psim.Insert(bsubj, psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index}) 7352 fs.tsl += len(subj) 7353 } 7354 } 7355 } 7356 } 7357 7358 // Close the message block. 7359 func (mb *msgBlock) close(sync bool) { 7360 if mb == nil { 7361 return 7362 } 7363 mb.mu.Lock() 7364 defer mb.mu.Unlock() 7365 7366 if mb.closed { 7367 return 7368 } 7369 7370 // Stop cache expiration timer. 7371 if mb.ctmr != nil { 7372 mb.ctmr.Stop() 7373 mb.ctmr = nil 7374 } 7375 7376 // Clear fss. 7377 mb.fss = nil 7378 7379 // Close cache 7380 mb.clearCacheAndOffset() 7381 // Quit our loops. 7382 if mb.qch != nil { 7383 close(mb.qch) 7384 mb.qch = nil 7385 } 7386 if mb.mfd != nil { 7387 if sync { 7388 mb.mfd.Sync() 7389 } 7390 mb.mfd.Close() 7391 } 7392 mb.mfd = nil 7393 // Mark as closed. 7394 mb.closed = true 7395 } 7396 7397 func (fs *fileStore) closeAllMsgBlocks(sync bool) { 7398 for _, mb := range fs.blks { 7399 mb.close(sync) 7400 } 7401 } 7402 7403 func (fs *fileStore) Delete() error { 7404 if fs.isClosed() { 7405 // Always attempt to remove since we could have been closed beforehand. 7406 os.RemoveAll(fs.fcfg.StoreDir) 7407 // Since we did remove, if we did have anything remaining make sure to 7408 // call into any storage updates that had been registered. 7409 fs.mu.Lock() 7410 cb, msgs, bytes := fs.scb, int64(fs.state.Msgs), int64(fs.state.Bytes) 7411 // Guard against double accounting if called twice. 7412 fs.state.Msgs, fs.state.Bytes = 0, 0 7413 fs.mu.Unlock() 7414 if msgs > 0 && cb != nil { 7415 cb(-msgs, -bytes, 0, _EMPTY_) 7416 } 7417 return ErrStoreClosed 7418 } 7419 7420 pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) 7421 // If purge directory still exists then we need to wait 7422 // in place and remove since rename would fail. 7423 if _, err := os.Stat(pdir); err == nil { 7424 os.RemoveAll(pdir) 7425 } 7426 7427 // Do Purge() since if we have lots of blocks uses a mv/rename. 7428 fs.Purge() 7429 7430 if err := fs.stop(false); err != nil { 7431 return err 7432 } 7433 7434 // Make sure we will not try to recover if killed before removal below completes. 7435 if err := os.Remove(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)); err != nil { 7436 return err 7437 } 7438 // Now move into different directory with "." prefix. 7439 ndir := filepath.Join(filepath.Dir(fs.fcfg.StoreDir), tsep+filepath.Base(fs.fcfg.StoreDir)) 7440 if err := os.Rename(fs.fcfg.StoreDir, ndir); err != nil { 7441 return err 7442 } 7443 // Do this in separate Go routine in case lots of blocks. 7444 // Purge above protects us as does the removal of meta artifacts above. 7445 go func() { 7446 err := os.RemoveAll(ndir) 7447 if err == nil { 7448 return 7449 } 7450 ttl := time.Now().Add(time.Second) 7451 for time.Now().Before(ttl) { 7452 time.Sleep(10 * time.Millisecond) 7453 if err = os.RemoveAll(ndir); err == nil { 7454 return 7455 } 7456 } 7457 }() 7458 7459 return nil 7460 } 7461 7462 // Lock should be held. 7463 func (fs *fileStore) setSyncTimer() { 7464 if fs.syncTmr != nil { 7465 fs.syncTmr.Reset(fs.fcfg.SyncInterval) 7466 } else { 7467 fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks) 7468 } 7469 } 7470 7471 // Lock should be held. 7472 func (fs *fileStore) cancelSyncTimer() { 7473 if fs.syncTmr != nil { 7474 fs.syncTmr.Stop() 7475 fs.syncTmr = nil 7476 } 7477 } 7478 7479 const ( 7480 fullStateMagic = uint8(11) 7481 fullStateVersion = uint8(1) 7482 ) 7483 7484 // This go routine runs and receives kicks to write out our full stream state index. 7485 // This will get kicked when we create a new block or when we delete a block in general. 7486 // This is also called during Stop(). 7487 func (fs *fileStore) flushStreamStateLoop(fch, qch, done chan struct{}) { 7488 // Make sure we do not try to write these out too fast. 7489 const writeThreshold = time.Minute 7490 lastWrite := time.Time{} 7491 7492 // We will use these to complete the full state write while not doing them too fast. 7493 var dt *time.Timer 7494 var dtc <-chan time.Time 7495 7496 defer close(done) 7497 7498 for { 7499 select { 7500 case <-fch: 7501 if elapsed := time.Since(lastWrite); elapsed > writeThreshold { 7502 fs.writeFullState() 7503 lastWrite = time.Now() 7504 if dt != nil { 7505 dt.Stop() 7506 dt, dtc = nil, nil 7507 } 7508 } else if dtc == nil { 7509 fireIn := time.Until(lastWrite.Add(writeThreshold)) 7510 if fireIn < 0 { 7511 fireIn = 100 * time.Millisecond 7512 } 7513 dt = time.NewTimer(fireIn) 7514 dtc = dt.C 7515 } 7516 case <-dtc: 7517 fs.writeFullState() 7518 lastWrite = time.Now() 7519 dt, dtc = nil, nil 7520 case <-qch: 7521 return 7522 } 7523 } 7524 } 7525 7526 // Kick the flusher. 7527 func (fs *fileStore) kickFlushStateLoop() { 7528 kickFlusher(fs.fch) 7529 } 7530 7531 // Helper since unixnano of zero time undefined. 7532 func timestampNormalized(t time.Time) int64 { 7533 if t.IsZero() { 7534 return 0 7535 } 7536 return t.UnixNano() 7537 } 7538 7539 // This will write the full binary state for the stream. 7540 // This plus everything new since last hash will be the total recovered state. 7541 // This state dump will have the following. 7542 // 1. Stream summary - Msgs, Bytes, First and Last (Sequence and Timestamp) 7543 // 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present. 7544 // 3. MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset). 7545 // 4. Last block index and hash of record inclusive to this stream state. 7546 func (fs *fileStore) writeFullState() error { 7547 fs.mu.Lock() 7548 if fs.closed || fs.dirty == 0 { 7549 fs.mu.Unlock() 7550 return nil 7551 } 7552 7553 // We track this through subsequent runs to get an avg per blk used for subsequent runs. 7554 avgDmapLen := fs.adml 7555 // If first time through could be 0 7556 if avgDmapLen == 0 && ((fs.state.LastSeq-fs.state.FirstSeq+1)-fs.state.Msgs) > 0 { 7557 avgDmapLen = 1024 7558 } 7559 7560 // For calculating size. 7561 numSubjects := fs.psim.Size() 7562 7563 // Calculate and estimate of the uper bound on the size to avoid multiple allocations. 7564 sz := 2 + // Magic and Version 7565 (binary.MaxVarintLen64 * 6) + // FS data 7566 binary.MaxVarintLen64 + fs.tsl + // NumSubjects + total subject length 7567 numSubjects*(binary.MaxVarintLen64*4) + // psi record 7568 binary.MaxVarintLen64 + // Num blocks. 7569 len(fs.blks)*((binary.MaxVarintLen64*7)+avgDmapLen) + // msg blocks, avgDmapLen is est for dmaps 7570 binary.MaxVarintLen64 + 8 + 8 // last index + record checksum + full state checksum 7571 7572 // Do 4k on stack if possible. 7573 const ssz = 4 * 1024 7574 var buf []byte 7575 7576 if sz <= ssz { 7577 var _buf [ssz]byte 7578 buf, sz = _buf[0:2:ssz], ssz 7579 } else { 7580 buf = make([]byte, hdrLen, sz) 7581 } 7582 7583 buf[0], buf[1] = fullStateMagic, fullStateVersion 7584 buf = binary.AppendUvarint(buf, fs.state.Msgs) 7585 buf = binary.AppendUvarint(buf, fs.state.Bytes) 7586 buf = binary.AppendUvarint(buf, fs.state.FirstSeq) 7587 buf = binary.AppendVarint(buf, timestampNormalized(fs.state.FirstTime)) 7588 buf = binary.AppendUvarint(buf, fs.state.LastSeq) 7589 buf = binary.AppendVarint(buf, timestampNormalized(fs.state.LastTime)) 7590 7591 // Do per subject information map if applicable. 7592 buf = binary.AppendUvarint(buf, uint64(numSubjects)) 7593 if numSubjects > 0 { 7594 fs.psim.Match([]byte(fwcs), func(subj []byte, psi *psi) { 7595 buf = binary.AppendUvarint(buf, uint64(len(subj))) 7596 buf = append(buf, subj...) 7597 buf = binary.AppendUvarint(buf, psi.total) 7598 buf = binary.AppendUvarint(buf, uint64(psi.fblk)) 7599 if psi.total > 1 { 7600 buf = binary.AppendUvarint(buf, uint64(psi.lblk)) 7601 } 7602 }) 7603 } 7604 7605 // Now walk all blocks and write out first and last and optional dmap encoding. 7606 var lbi uint32 7607 var lchk [8]byte 7608 7609 nb := len(fs.blks) 7610 buf = binary.AppendUvarint(buf, uint64(nb)) 7611 7612 // Use basetime to save some space. 7613 baseTime := timestampNormalized(fs.state.FirstTime) 7614 var scratch [8 * 1024]byte 7615 7616 // Track the state as represented by the mbs. 7617 var mstate StreamState 7618 7619 var dmapTotalLen int 7620 for _, mb := range fs.blks { 7621 mb.mu.RLock() 7622 buf = binary.AppendUvarint(buf, uint64(mb.index)) 7623 buf = binary.AppendUvarint(buf, mb.bytes) 7624 buf = binary.AppendUvarint(buf, atomic.LoadUint64(&mb.first.seq)) 7625 buf = binary.AppendVarint(buf, mb.first.ts-baseTime) 7626 buf = binary.AppendUvarint(buf, atomic.LoadUint64(&mb.last.seq)) 7627 buf = binary.AppendVarint(buf, mb.last.ts-baseTime) 7628 7629 numDeleted := mb.dmap.Size() 7630 buf = binary.AppendUvarint(buf, uint64(numDeleted)) 7631 if numDeleted > 0 { 7632 dmap, _ := mb.dmap.Encode(scratch[:0]) 7633 dmapTotalLen += len(dmap) 7634 buf = append(buf, dmap...) 7635 } 7636 // If this is the last one grab the last checksum and the block index, e.g. 22.blk, 22 is the block index. 7637 // We use this to quickly open this file on recovery. 7638 if mb == fs.lmb { 7639 lbi = mb.index 7640 mb.ensureLastChecksumLoaded() 7641 copy(lchk[0:], mb.lchk[:]) 7642 } 7643 updateTrackingState(&mstate, mb) 7644 mb.mu.RUnlock() 7645 } 7646 if dmapTotalLen > 0 { 7647 fs.adml = dmapTotalLen / len(fs.blks) 7648 } 7649 7650 // Place block index and hash onto the end. 7651 buf = binary.AppendUvarint(buf, uint64(lbi)) 7652 buf = append(buf, lchk[:]...) 7653 7654 // Encrypt if needed. 7655 if fs.prf != nil { 7656 if err := fs.setupAEK(); err != nil { 7657 fs.mu.Unlock() 7658 return err 7659 } 7660 nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead()) 7661 rand.Read(nonce) 7662 buf = fs.aek.Seal(nonce, nonce, buf, nil) 7663 } 7664 7665 fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) 7666 7667 fs.hh.Reset() 7668 fs.hh.Write(buf) 7669 buf = fs.hh.Sum(buf) 7670 7671 // Snapshot prior dirty count. 7672 priorDirty := fs.dirty 7673 7674 // Check tracking state. 7675 statesEqual := trackingStatesEqual(&fs.state, &mstate) 7676 // Release lock. 7677 fs.mu.Unlock() 7678 7679 // Check consistency here. 7680 if !statesEqual { 7681 fs.warn("Stream state encountered internal inconsistency on write") 7682 // Rebuild our fs state from the mb state. 7683 fs.rebuildState(nil) 7684 // Make sure to reprocess. 7685 fs.kickFlushStateLoop() 7686 return errCorruptState 7687 } 7688 7689 if cap(buf) > sz { 7690 fs.debug("WriteFullState reallocated from %d to %d", sz, cap(buf)) 7691 } 7692 7693 // Write to a tmp file and rename. 7694 const tmpPre = streamStreamStateFile + tsep 7695 f, err := os.CreateTemp(filepath.Join(fs.fcfg.StoreDir, msgDir), tmpPre) 7696 if err != nil { 7697 return err 7698 } 7699 tmpName := f.Name() 7700 defer os.Remove(tmpName) 7701 if _, err = f.Write(buf); err == nil && fs.fcfg.SyncAlways { 7702 f.Sync() 7703 } 7704 f.Close() 7705 if err != nil { 7706 return err 7707 } 7708 7709 // Rename into position under our lock, clear prior dirty pending on success. 7710 fs.mu.Lock() 7711 if !fs.closed { 7712 if err := os.Rename(tmpName, fn); err != nil { 7713 fs.mu.Unlock() 7714 return err 7715 } 7716 fs.dirty -= priorDirty 7717 } 7718 fs.mu.Unlock() 7719 7720 return nil 7721 } 7722 7723 // Stop the current filestore. 7724 func (fs *fileStore) Stop() error { 7725 return fs.stop(true) 7726 } 7727 7728 // Stop the current filestore. 7729 func (fs *fileStore) stop(writeState bool) error { 7730 fs.mu.Lock() 7731 if fs.closed || fs.closing { 7732 fs.mu.Unlock() 7733 return ErrStoreClosed 7734 } 7735 7736 // Mark as closing. Do before releasing the lock to writeFullState 7737 // so we don't end up with this function running more than once. 7738 fs.closing = true 7739 7740 if writeState { 7741 fs.checkAndFlushAllBlocks() 7742 } 7743 fs.closeAllMsgBlocks(false) 7744 7745 fs.cancelSyncTimer() 7746 fs.cancelAgeChk() 7747 7748 // Release the state flusher loop. 7749 if fs.qch != nil { 7750 close(fs.qch) 7751 fs.qch = nil 7752 } 7753 7754 if writeState { 7755 // Wait for the state flush loop to exit. 7756 fsld := fs.fsld 7757 fs.mu.Unlock() 7758 <-fsld 7759 // Write full state if needed. If not dirty this is a no-op. 7760 fs.writeFullState() 7761 fs.mu.Lock() 7762 } 7763 7764 // Mark as closed. Last message block needs to be cleared after 7765 // writeFullState has completed. 7766 fs.closed = true 7767 fs.lmb = nil 7768 7769 // We should update the upper usage layer on a stop. 7770 cb, bytes := fs.scb, int64(fs.state.Bytes) 7771 fs.mu.Unlock() 7772 7773 fs.cmu.Lock() 7774 var _cfs [256]ConsumerStore 7775 cfs := append(_cfs[:0], fs.cfs...) 7776 fs.cfs = nil 7777 fs.cmu.Unlock() 7778 7779 for _, o := range cfs { 7780 o.Stop() 7781 } 7782 7783 if bytes > 0 && cb != nil { 7784 cb(0, -bytes, 0, _EMPTY_) 7785 } 7786 7787 return nil 7788 } 7789 7790 const errFile = "errors.txt" 7791 7792 // Stream our snapshot through S2 compression and tar. 7793 func (fs *fileStore) streamSnapshot(w io.WriteCloser, state *StreamState, includeConsumers bool) { 7794 defer w.Close() 7795 7796 enc := s2.NewWriter(w) 7797 defer enc.Close() 7798 7799 tw := tar.NewWriter(enc) 7800 defer tw.Close() 7801 7802 defer func() { 7803 fs.mu.Lock() 7804 fs.sips-- 7805 fs.mu.Unlock() 7806 }() 7807 7808 modTime := time.Now().UTC() 7809 7810 writeFile := func(name string, buf []byte) error { 7811 hdr := &tar.Header{ 7812 Name: name, 7813 Mode: 0600, 7814 ModTime: modTime, 7815 Uname: "nats", 7816 Gname: "nats", 7817 Size: int64(len(buf)), 7818 Format: tar.FormatPAX, 7819 } 7820 if err := tw.WriteHeader(hdr); err != nil { 7821 return err 7822 } 7823 if _, err := tw.Write(buf); err != nil { 7824 return err 7825 } 7826 return nil 7827 } 7828 7829 writeErr := func(err string) { 7830 writeFile(errFile, []byte(err)) 7831 } 7832 7833 fs.mu.Lock() 7834 blks := fs.blks 7835 // Grab our general meta data. 7836 // We do this now instead of pulling from files since they could be encrypted. 7837 meta, err := json.Marshal(fs.cfg) 7838 if err != nil { 7839 fs.mu.Unlock() 7840 writeErr(fmt.Sprintf("Could not gather stream meta file: %v", err)) 7841 return 7842 } 7843 hh := fs.hh 7844 hh.Reset() 7845 hh.Write(meta) 7846 sum := []byte(hex.EncodeToString(fs.hh.Sum(nil))) 7847 fs.mu.Unlock() 7848 7849 // Meta first. 7850 if writeFile(JetStreamMetaFile, meta) != nil { 7851 return 7852 } 7853 if writeFile(JetStreamMetaFileSum, sum) != nil { 7854 return 7855 } 7856 7857 // Can't use join path here, tar only recognizes relative paths with forward slashes. 7858 msgPre := msgDir + "/" 7859 var bbuf []byte 7860 7861 const minLen = 32 7862 sfn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) 7863 if buf, err := os.ReadFile(sfn); err == nil && len(buf) >= minLen { 7864 if fs.aek != nil { 7865 ns := fs.aek.NonceSize() 7866 buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:len(buf)-highwayhash.Size64], nil) 7867 if err == nil { 7868 // Redo hash checksum at end on plaintext. 7869 fs.mu.Lock() 7870 hh.Reset() 7871 hh.Write(buf) 7872 buf = fs.hh.Sum(buf) 7873 fs.mu.Unlock() 7874 } 7875 } 7876 if err == nil && writeFile(msgPre+streamStreamStateFile, buf) != nil { 7877 return 7878 } 7879 } 7880 7881 // Now do messages themselves. 7882 for _, mb := range blks { 7883 if mb.pendingWriteSize() > 0 { 7884 mb.flushPendingMsgs() 7885 } 7886 mb.mu.Lock() 7887 // We could stream but don't want to hold the lock and prevent changes, so just read in and 7888 // release the lock for now. 7889 bbuf, err = mb.loadBlock(bbuf) 7890 if err != nil { 7891 mb.mu.Unlock() 7892 writeErr(fmt.Sprintf("Could not read message block [%d]: %v", mb.index, err)) 7893 return 7894 } 7895 // Check for encryption. 7896 if mb.bek != nil && len(bbuf) > 0 { 7897 rbek, err := genBlockEncryptionKey(fs.fcfg.Cipher, mb.seed, mb.nonce) 7898 if err != nil { 7899 mb.mu.Unlock() 7900 writeErr(fmt.Sprintf("Could not create encryption key for message block [%d]: %v", mb.index, err)) 7901 return 7902 } 7903 rbek.XORKeyStream(bbuf, bbuf) 7904 } 7905 // Check for compression. 7906 if bbuf, err = mb.decompressIfNeeded(bbuf); err != nil { 7907 mb.mu.Unlock() 7908 writeErr(fmt.Sprintf("Could not decompress message block [%d]: %v", mb.index, err)) 7909 return 7910 } 7911 mb.mu.Unlock() 7912 7913 // Do this one unlocked. 7914 if writeFile(msgPre+fmt.Sprintf(blkScan, mb.index), bbuf) != nil { 7915 return 7916 } 7917 } 7918 7919 // Bail if no consumers requested. 7920 if !includeConsumers { 7921 return 7922 } 7923 7924 // Do consumers' state last. 7925 fs.cmu.RLock() 7926 cfs := fs.cfs 7927 fs.cmu.RUnlock() 7928 7929 for _, cs := range cfs { 7930 o, ok := cs.(*consumerFileStore) 7931 if !ok { 7932 continue 7933 } 7934 o.mu.Lock() 7935 // Grab our general meta data. 7936 // We do this now instead of pulling from files since they could be encrypted. 7937 meta, err := json.Marshal(o.cfg) 7938 if err != nil { 7939 o.mu.Unlock() 7940 writeErr(fmt.Sprintf("Could not gather consumer meta file for %q: %v", o.name, err)) 7941 return 7942 } 7943 o.hh.Reset() 7944 o.hh.Write(meta) 7945 sum := []byte(hex.EncodeToString(o.hh.Sum(nil))) 7946 7947 // We can have the running state directly encoded now. 7948 state, err := o.encodeState() 7949 if err != nil { 7950 o.mu.Unlock() 7951 writeErr(fmt.Sprintf("Could not encode consumer state for %q: %v", o.name, err)) 7952 return 7953 } 7954 odirPre := filepath.Join(consumerDir, o.name) 7955 o.mu.Unlock() 7956 7957 // Write all the consumer files. 7958 if writeFile(filepath.Join(odirPre, JetStreamMetaFile), meta) != nil { 7959 return 7960 } 7961 if writeFile(filepath.Join(odirPre, JetStreamMetaFileSum), sum) != nil { 7962 return 7963 } 7964 writeFile(filepath.Join(odirPre, consumerState), state) 7965 } 7966 } 7967 7968 // Create a snapshot of this stream and its consumer's state along with messages. 7969 func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumers bool) (*SnapshotResult, error) { 7970 fs.mu.Lock() 7971 if fs.closed { 7972 fs.mu.Unlock() 7973 return nil, ErrStoreClosed 7974 } 7975 // Only allow one at a time. 7976 if fs.sips > 0 { 7977 fs.mu.Unlock() 7978 return nil, ErrStoreSnapshotInProgress 7979 } 7980 // Mark us as snapshotting 7981 fs.sips += 1 7982 fs.mu.Unlock() 7983 7984 if checkMsgs { 7985 ld := fs.checkMsgs() 7986 if ld != nil && len(ld.Msgs) > 0 { 7987 return nil, fmt.Errorf("snapshot check detected %d bad messages", len(ld.Msgs)) 7988 } 7989 } 7990 7991 // Write out full state as well before proceeding. 7992 fs.writeFullState() 7993 7994 pr, pw := net.Pipe() 7995 7996 // Set a write deadline here to protect ourselves. 7997 if deadline > 0 { 7998 pw.SetWriteDeadline(time.Now().Add(deadline)) 7999 } 8000 8001 // We can add to our stream while snapshotting but not "user" delete anything. 8002 var state StreamState 8003 fs.FastState(&state) 8004 8005 // Stream in separate Go routine. 8006 go fs.streamSnapshot(pw, &state, includeConsumers) 8007 8008 return &SnapshotResult{pr, state}, nil 8009 } 8010 8011 // Helper to return the config. 8012 func (fs *fileStore) fileStoreConfig() FileStoreConfig { 8013 fs.mu.RLock() 8014 defer fs.mu.RUnlock() 8015 return fs.fcfg 8016 } 8017 8018 // Read lock all existing message blocks. 8019 // Lock held on entry. 8020 func (fs *fileStore) readLockAllMsgBlocks() { 8021 for _, mb := range fs.blks { 8022 mb.mu.RLock() 8023 } 8024 } 8025 8026 // Read unlock all existing message blocks. 8027 // Lock held on entry. 8028 func (fs *fileStore) readUnlockAllMsgBlocks() { 8029 for _, mb := range fs.blks { 8030 mb.mu.RUnlock() 8031 } 8032 } 8033 8034 // Binary encoded state snapshot, >= v2.10 server. 8035 func (fs *fileStore) EncodedStreamState(failed uint64) ([]byte, error) { 8036 fs.mu.RLock() 8037 defer fs.mu.RUnlock() 8038 8039 // Calculate deleted. 8040 var numDeleted int64 8041 if fs.state.LastSeq > fs.state.FirstSeq { 8042 numDeleted = int64(fs.state.LastSeq-fs.state.FirstSeq+1) - int64(fs.state.Msgs) 8043 if numDeleted < 0 { 8044 numDeleted = 0 8045 } 8046 } 8047 8048 // Encoded is Msgs, Bytes, FirstSeq, LastSeq, Failed, NumDeleted and optional DeletedBlocks 8049 var buf [1024]byte 8050 buf[0], buf[1] = streamStateMagic, streamStateVersion 8051 n := hdrLen 8052 n += binary.PutUvarint(buf[n:], fs.state.Msgs) 8053 n += binary.PutUvarint(buf[n:], fs.state.Bytes) 8054 n += binary.PutUvarint(buf[n:], fs.state.FirstSeq) 8055 n += binary.PutUvarint(buf[n:], fs.state.LastSeq) 8056 n += binary.PutUvarint(buf[n:], failed) 8057 n += binary.PutUvarint(buf[n:], uint64(numDeleted)) 8058 8059 b := buf[0:n] 8060 8061 if numDeleted > 0 { 8062 var scratch [4 * 1024]byte 8063 8064 fs.readLockAllMsgBlocks() 8065 defer fs.readUnlockAllMsgBlocks() 8066 8067 for _, db := range fs.deleteBlocks() { 8068 switch db := db.(type) { 8069 case *DeleteRange: 8070 first, _, num := db.State() 8071 scratch[0] = runLengthMagic 8072 i := 1 8073 i += binary.PutUvarint(scratch[i:], first) 8074 i += binary.PutUvarint(scratch[i:], num) 8075 b = append(b, scratch[0:i]...) 8076 case *avl.SequenceSet: 8077 buf, err := db.Encode(scratch[:0]) 8078 if err != nil { 8079 return nil, err 8080 } 8081 b = append(b, buf...) 8082 default: 8083 return nil, errors.New("no impl") 8084 } 8085 } 8086 } 8087 8088 return b, nil 8089 } 8090 8091 // We used to be more sophisticated to save memory, but speed is more important. 8092 // All blocks should be at least read locked. 8093 func (fs *fileStore) deleteBlocks() DeleteBlocks { 8094 var dbs DeleteBlocks 8095 var prevLast uint64 8096 8097 for _, mb := range fs.blks { 8098 // Detect if we have a gap between these blocks. 8099 fseq := atomic.LoadUint64(&mb.first.seq) 8100 if prevLast > 0 && prevLast+1 != fseq { 8101 dbs = append(dbs, &DeleteRange{First: prevLast + 1, Num: fseq - prevLast - 1}) 8102 } 8103 if mb.dmap.Size() > 0 { 8104 dbs = append(dbs, &mb.dmap) 8105 } 8106 prevLast = atomic.LoadUint64(&mb.last.seq) 8107 } 8108 return dbs 8109 } 8110 8111 // SyncDeleted will make sure this stream has same deleted state as dbs. 8112 func (fs *fileStore) SyncDeleted(dbs DeleteBlocks) { 8113 if len(dbs) == 0 { 8114 return 8115 } 8116 8117 fs.mu.Lock() 8118 defer fs.mu.Unlock() 8119 8120 var needsCheck DeleteBlocks 8121 8122 fs.readLockAllMsgBlocks() 8123 mdbs := fs.deleteBlocks() 8124 for i, db := range dbs { 8125 // If the block is same as what we have we can skip. 8126 if i < len(mdbs) { 8127 first, last, num := db.State() 8128 eFirst, eLast, eNum := mdbs[i].State() 8129 if first == eFirst && last == eLast && num == eNum { 8130 continue 8131 } 8132 } 8133 // Need to insert these. 8134 needsCheck = append(needsCheck, db) 8135 } 8136 fs.readUnlockAllMsgBlocks() 8137 8138 for _, db := range needsCheck { 8139 db.Range(func(dseq uint64) bool { 8140 fs.removeMsg(dseq, false, true, false) 8141 return true 8142 }) 8143 } 8144 } 8145 8146 //////////////////////////////////////////////////////////////////////////////// 8147 // Consumers 8148 //////////////////////////////////////////////////////////////////////////////// 8149 8150 type consumerFileStore struct { 8151 mu sync.Mutex 8152 fs *fileStore 8153 cfg *FileConsumerInfo 8154 prf keyGen 8155 aek cipher.AEAD 8156 name string 8157 odir string 8158 ifn string 8159 hh hash.Hash64 8160 state ConsumerState 8161 fch chan struct{} 8162 qch chan struct{} 8163 flusher bool 8164 writing bool 8165 dirty bool 8166 closed bool 8167 } 8168 8169 func (fs *fileStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerStore, error) { 8170 if fs == nil { 8171 return nil, fmt.Errorf("filestore is nil") 8172 } 8173 if fs.isClosed() { 8174 return nil, ErrStoreClosed 8175 } 8176 if cfg == nil || name == _EMPTY_ { 8177 return nil, fmt.Errorf("bad consumer config") 8178 } 8179 8180 // We now allow overrides from a stream being a filestore type and forcing a consumer to be memory store. 8181 if cfg.MemoryStorage { 8182 // Create directly here. 8183 o := &consumerMemStore{ms: fs, cfg: *cfg} 8184 fs.AddConsumer(o) 8185 return o, nil 8186 } 8187 8188 odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, name) 8189 if err := os.MkdirAll(odir, defaultDirPerms); err != nil { 8190 return nil, fmt.Errorf("could not create consumer directory - %v", err) 8191 } 8192 csi := &FileConsumerInfo{Name: name, Created: time.Now().UTC(), ConsumerConfig: *cfg} 8193 o := &consumerFileStore{ 8194 fs: fs, 8195 cfg: csi, 8196 prf: fs.prf, 8197 name: name, 8198 odir: odir, 8199 ifn: filepath.Join(odir, consumerState), 8200 } 8201 key := sha256.Sum256([]byte(fs.cfg.Name + "/" + name)) 8202 hh, err := highwayhash.New64(key[:]) 8203 if err != nil { 8204 return nil, fmt.Errorf("could not create hash: %v", err) 8205 } 8206 o.hh = hh 8207 8208 // Check for encryption. 8209 if o.prf != nil { 8210 if ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey)); err == nil { 8211 if len(ekey) < minBlkKeySize { 8212 return nil, errBadKeySize 8213 } 8214 // Recover key encryption key. 8215 rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name)) 8216 if err != nil { 8217 return nil, err 8218 } 8219 8220 sc := fs.fcfg.Cipher 8221 kek, err := genEncryptionKey(sc, rb) 8222 if err != nil { 8223 return nil, err 8224 } 8225 ns := kek.NonceSize() 8226 nonce := ekey[:ns] 8227 seed, err := kek.Open(nil, nonce, ekey[ns:], nil) 8228 if err != nil { 8229 // We may be here on a cipher conversion, so attempt to convert. 8230 if err = o.convertCipher(); err != nil { 8231 return nil, err 8232 } 8233 } else { 8234 o.aek, err = genEncryptionKey(sc, seed) 8235 } 8236 if err != nil { 8237 return nil, err 8238 } 8239 } 8240 } 8241 8242 // Track if we are creating the directory so that we can clean up if we encounter an error. 8243 var didCreate bool 8244 8245 // Write our meta data iff does not exist. 8246 meta := filepath.Join(odir, JetStreamMetaFile) 8247 if _, err := os.Stat(meta); err != nil && os.IsNotExist(err) { 8248 didCreate = true 8249 csi.Created = time.Now().UTC() 8250 if err := o.writeConsumerMeta(); err != nil { 8251 os.RemoveAll(odir) 8252 return nil, err 8253 } 8254 } 8255 8256 // If we expect to be encrypted check that what we are restoring is not plaintext. 8257 // This can happen on snapshot restores or conversions. 8258 if o.prf != nil { 8259 keyFile := filepath.Join(odir, JetStreamMetaFileKey) 8260 if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) { 8261 if err := o.writeConsumerMeta(); err != nil { 8262 if didCreate { 8263 os.RemoveAll(odir) 8264 } 8265 return nil, err 8266 } 8267 // Redo the state file as well here if we have one and we can tell it was plaintext. 8268 if buf, err := os.ReadFile(o.ifn); err == nil { 8269 if _, err := decodeConsumerState(buf); err == nil { 8270 <-dios 8271 err := os.WriteFile(o.ifn, o.encryptState(buf), defaultFilePerms) 8272 dios <- struct{}{} 8273 if err != nil { 8274 if didCreate { 8275 os.RemoveAll(odir) 8276 } 8277 return nil, err 8278 } 8279 } 8280 } 8281 } 8282 } 8283 8284 // Create channels to control our flush go routine. 8285 o.fch = make(chan struct{}, 1) 8286 o.qch = make(chan struct{}) 8287 go o.flushLoop(o.fch, o.qch) 8288 8289 // Make sure to load in our state from disk if needed. 8290 o.loadState() 8291 8292 // Assign to filestore. 8293 fs.AddConsumer(o) 8294 8295 return o, nil 8296 } 8297 8298 func (o *consumerFileStore) convertCipher() error { 8299 fs := o.fs 8300 odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, o.name) 8301 8302 ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey)) 8303 if err != nil { 8304 return err 8305 } 8306 if len(ekey) < minBlkKeySize { 8307 return errBadKeySize 8308 } 8309 // Recover key encryption key. 8310 rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name)) 8311 if err != nil { 8312 return err 8313 } 8314 8315 // Do these in reverse since converting. 8316 sc := fs.fcfg.Cipher 8317 osc := AES 8318 if sc == AES { 8319 osc = ChaCha 8320 } 8321 kek, err := genEncryptionKey(osc, rb) 8322 if err != nil { 8323 return err 8324 } 8325 ns := kek.NonceSize() 8326 nonce := ekey[:ns] 8327 seed, err := kek.Open(nil, nonce, ekey[ns:], nil) 8328 if err != nil { 8329 return err 8330 } 8331 aek, err := genEncryptionKey(osc, seed) 8332 if err != nil { 8333 return err 8334 } 8335 // Now read in and decode our state using the old cipher. 8336 buf, err := os.ReadFile(o.ifn) 8337 if err != nil { 8338 return err 8339 } 8340 buf, err = aek.Open(nil, buf[:ns], buf[ns:], nil) 8341 if err != nil { 8342 return err 8343 } 8344 8345 // Since we are here we recovered our old state. 8346 // Now write our meta, which will generate the new keys with the new cipher. 8347 if err := o.writeConsumerMeta(); err != nil { 8348 return err 8349 } 8350 8351 // Now write out or state with the new cipher. 8352 return o.writeState(buf) 8353 } 8354 8355 // Kick flusher for this consumer. 8356 // Lock should be held. 8357 func (o *consumerFileStore) kickFlusher() { 8358 if o.fch != nil { 8359 select { 8360 case o.fch <- struct{}{}: 8361 default: 8362 } 8363 } 8364 o.dirty = true 8365 } 8366 8367 // Set in flusher status 8368 func (o *consumerFileStore) setInFlusher() { 8369 o.mu.Lock() 8370 o.flusher = true 8371 o.mu.Unlock() 8372 } 8373 8374 // Clear in flusher status 8375 func (o *consumerFileStore) clearInFlusher() { 8376 o.mu.Lock() 8377 o.flusher = false 8378 o.mu.Unlock() 8379 } 8380 8381 // Report in flusher status 8382 func (o *consumerFileStore) inFlusher() bool { 8383 o.mu.Lock() 8384 defer o.mu.Unlock() 8385 return o.flusher 8386 } 8387 8388 // flushLoop watches for consumer updates and the quit channel. 8389 func (o *consumerFileStore) flushLoop(fch, qch chan struct{}) { 8390 8391 o.setInFlusher() 8392 defer o.clearInFlusher() 8393 8394 // Maintain approximately 10 updates per second per consumer under load. 8395 const minTime = 100 * time.Millisecond 8396 var lastWrite time.Time 8397 var dt *time.Timer 8398 8399 setDelayTimer := func(addWait time.Duration) { 8400 if dt == nil { 8401 dt = time.NewTimer(addWait) 8402 return 8403 } 8404 if !dt.Stop() { 8405 select { 8406 case <-dt.C: 8407 default: 8408 } 8409 } 8410 dt.Reset(addWait) 8411 } 8412 8413 for { 8414 select { 8415 case <-fch: 8416 if ts := time.Since(lastWrite); ts < minTime { 8417 setDelayTimer(minTime - ts) 8418 select { 8419 case <-dt.C: 8420 case <-qch: 8421 return 8422 } 8423 } 8424 o.mu.Lock() 8425 if o.closed { 8426 o.mu.Unlock() 8427 return 8428 } 8429 buf, err := o.encodeState() 8430 o.mu.Unlock() 8431 if err != nil { 8432 return 8433 } 8434 // TODO(dlc) - if we error should start failing upwards. 8435 if err := o.writeState(buf); err == nil { 8436 lastWrite = time.Now() 8437 } 8438 case <-qch: 8439 return 8440 } 8441 } 8442 } 8443 8444 // SetStarting sets our starting stream sequence. 8445 func (o *consumerFileStore) SetStarting(sseq uint64) error { 8446 o.mu.Lock() 8447 o.state.Delivered.Stream = sseq 8448 buf, err := o.encodeState() 8449 o.mu.Unlock() 8450 if err != nil { 8451 return err 8452 } 8453 return o.writeState(buf) 8454 } 8455 8456 // HasState returns if this store has a recorded state. 8457 func (o *consumerFileStore) HasState() bool { 8458 o.mu.Lock() 8459 _, err := os.Stat(o.ifn) 8460 o.mu.Unlock() 8461 return err == nil 8462 } 8463 8464 // UpdateDelivered is called whenever a new message has been delivered. 8465 func (o *consumerFileStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) error { 8466 o.mu.Lock() 8467 defer o.mu.Unlock() 8468 8469 if dc != 1 && o.cfg.AckPolicy == AckNone { 8470 return ErrNoAckPolicy 8471 } 8472 8473 // On restarts the old leader may get a replay from the raft logs that are old. 8474 if dseq <= o.state.AckFloor.Consumer { 8475 return nil 8476 } 8477 8478 // See if we expect an ack for this. 8479 if o.cfg.AckPolicy != AckNone { 8480 // Need to create pending records here. 8481 if o.state.Pending == nil { 8482 o.state.Pending = make(map[uint64]*Pending) 8483 } 8484 var p *Pending 8485 // Check for an update to a message already delivered. 8486 if sseq <= o.state.Delivered.Stream { 8487 if p = o.state.Pending[sseq]; p != nil { 8488 p.Sequence, p.Timestamp = dseq, ts 8489 } 8490 } else { 8491 // Add to pending. 8492 o.state.Pending[sseq] = &Pending{dseq, ts} 8493 } 8494 // Update delivered as needed. 8495 if dseq > o.state.Delivered.Consumer { 8496 o.state.Delivered.Consumer = dseq 8497 } 8498 if sseq > o.state.Delivered.Stream { 8499 o.state.Delivered.Stream = sseq 8500 } 8501 8502 if dc > 1 { 8503 if maxdc := uint64(o.cfg.MaxDeliver); maxdc > 0 && dc > maxdc { 8504 // Make sure to remove from pending. 8505 delete(o.state.Pending, sseq) 8506 } 8507 if o.state.Redelivered == nil { 8508 o.state.Redelivered = make(map[uint64]uint64) 8509 } 8510 // Only update if greater then what we already have. 8511 if o.state.Redelivered[sseq] < dc-1 { 8512 o.state.Redelivered[sseq] = dc - 1 8513 } 8514 } 8515 } else { 8516 // For AckNone just update delivered and ackfloor at the same time. 8517 if dseq > o.state.Delivered.Consumer { 8518 o.state.Delivered.Consumer = dseq 8519 o.state.AckFloor.Consumer = dseq 8520 } 8521 if sseq > o.state.Delivered.Stream { 8522 o.state.Delivered.Stream = sseq 8523 o.state.AckFloor.Stream = sseq 8524 } 8525 } 8526 // Make sure we flush to disk. 8527 o.kickFlusher() 8528 8529 return nil 8530 } 8531 8532 // UpdateAcks is called whenever a consumer with explicit ack or ack all acks a message. 8533 func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error { 8534 o.mu.Lock() 8535 defer o.mu.Unlock() 8536 8537 if o.cfg.AckPolicy == AckNone { 8538 return ErrNoAckPolicy 8539 } 8540 8541 // On restarts the old leader may get a replay from the raft logs that are old. 8542 if dseq <= o.state.AckFloor.Consumer { 8543 return nil 8544 } 8545 8546 if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil { 8547 return ErrStoreMsgNotFound 8548 } 8549 8550 // Check for AckAll here. 8551 if o.cfg.AckPolicy == AckAll { 8552 sgap := sseq - o.state.AckFloor.Stream 8553 o.state.AckFloor.Consumer = dseq 8554 o.state.AckFloor.Stream = sseq 8555 for seq := sseq; seq > sseq-sgap; seq-- { 8556 delete(o.state.Pending, seq) 8557 if len(o.state.Redelivered) > 0 { 8558 delete(o.state.Redelivered, seq) 8559 } 8560 } 8561 o.kickFlusher() 8562 return nil 8563 } 8564 8565 // AckExplicit 8566 8567 // First delete from our pending state. 8568 if p, ok := o.state.Pending[sseq]; ok { 8569 delete(o.state.Pending, sseq) 8570 dseq = p.Sequence // Use the original. 8571 } 8572 if len(o.state.Pending) == 0 { 8573 o.state.AckFloor.Consumer = o.state.Delivered.Consumer 8574 o.state.AckFloor.Stream = o.state.Delivered.Stream 8575 } else if dseq == o.state.AckFloor.Consumer+1 { 8576 o.state.AckFloor.Consumer = dseq 8577 o.state.AckFloor.Stream = sseq 8578 8579 if o.state.Delivered.Consumer > dseq { 8580 for ss := sseq + 1; ss <= o.state.Delivered.Stream; ss++ { 8581 if p, ok := o.state.Pending[ss]; ok { 8582 if p.Sequence > 0 { 8583 o.state.AckFloor.Consumer = p.Sequence - 1 8584 o.state.AckFloor.Stream = ss - 1 8585 } 8586 break 8587 } 8588 } 8589 } 8590 } 8591 // We do these regardless. 8592 delete(o.state.Redelivered, sseq) 8593 8594 o.kickFlusher() 8595 return nil 8596 } 8597 8598 const seqsHdrSize = 6*binary.MaxVarintLen64 + hdrLen 8599 8600 // Encode our consumer state, version 2. 8601 // Lock should be held. 8602 8603 func (o *consumerFileStore) EncodedState() ([]byte, error) { 8604 o.mu.Lock() 8605 defer o.mu.Unlock() 8606 return o.encodeState() 8607 } 8608 8609 func (o *consumerFileStore) encodeState() ([]byte, error) { 8610 // Grab reference to state, but make sure we load in if needed, so do not reference o.state directly. 8611 state, err := o.stateWithCopyLocked(false) 8612 if err != nil { 8613 return nil, err 8614 } 8615 return encodeConsumerState(state), nil 8616 } 8617 8618 func (o *consumerFileStore) UpdateConfig(cfg *ConsumerConfig) error { 8619 o.mu.Lock() 8620 defer o.mu.Unlock() 8621 8622 // This is mostly unchecked here. We are assuming the upper layers have done sanity checking. 8623 csi := o.cfg 8624 csi.ConsumerConfig = *cfg 8625 8626 return o.writeConsumerMeta() 8627 } 8628 8629 func (o *consumerFileStore) Update(state *ConsumerState) error { 8630 o.mu.Lock() 8631 defer o.mu.Unlock() 8632 8633 // Check to see if this is an outdated update. 8634 if state.Delivered.Consumer < o.state.Delivered.Consumer || state.AckFloor.Stream < o.state.AckFloor.Stream { 8635 return nil 8636 } 8637 8638 // Sanity checks. 8639 if state.AckFloor.Consumer > state.Delivered.Consumer { 8640 return fmt.Errorf("bad ack floor for consumer") 8641 } 8642 if state.AckFloor.Stream > state.Delivered.Stream { 8643 return fmt.Errorf("bad ack floor for stream") 8644 } 8645 8646 // Copy to our state. 8647 var pending map[uint64]*Pending 8648 var redelivered map[uint64]uint64 8649 if len(state.Pending) > 0 { 8650 pending = make(map[uint64]*Pending, len(state.Pending)) 8651 for seq, p := range state.Pending { 8652 pending[seq] = &Pending{p.Sequence, p.Timestamp} 8653 if seq <= state.AckFloor.Stream || seq > state.Delivered.Stream { 8654 return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq) 8655 } 8656 } 8657 } 8658 if len(state.Redelivered) > 0 { 8659 redelivered = make(map[uint64]uint64, len(state.Redelivered)) 8660 for seq, dc := range state.Redelivered { 8661 redelivered[seq] = dc 8662 } 8663 } 8664 8665 o.state.Delivered = state.Delivered 8666 o.state.AckFloor = state.AckFloor 8667 o.state.Pending = pending 8668 o.state.Redelivered = redelivered 8669 8670 o.kickFlusher() 8671 8672 return nil 8673 } 8674 8675 // Will encrypt the state with our asset key. Will be a no-op if encryption not enabled. 8676 // Lock should be held. 8677 func (o *consumerFileStore) encryptState(buf []byte) []byte { 8678 if o.aek == nil { 8679 return buf 8680 } 8681 // TODO(dlc) - Optimize on space usage a bit? 8682 nonce := make([]byte, o.aek.NonceSize(), o.aek.NonceSize()+len(buf)+o.aek.Overhead()) 8683 rand.Read(nonce) 8684 return o.aek.Seal(nonce, nonce, buf, nil) 8685 } 8686 8687 // Used to limit number of disk IO calls in flight since they could all be blocking an OS thread. 8688 // https://github.com/nats-io/nats-server/issues/2742 8689 var dios chan struct{} 8690 8691 // Used to setup our simplistic counting semaphore using buffered channels. 8692 // golang.org's semaphore seemed a bit heavy. 8693 func init() { 8694 // Limit ourselves to a max of 4 blocking IO calls. 8695 const nIO = 4 8696 dios = make(chan struct{}, nIO) 8697 // Fill it up to start. 8698 for i := 0; i < nIO; i++ { 8699 dios <- struct{}{} 8700 } 8701 } 8702 8703 func (o *consumerFileStore) writeState(buf []byte) error { 8704 // Check if we have the index file open. 8705 o.mu.Lock() 8706 if o.writing || len(buf) == 0 { 8707 o.mu.Unlock() 8708 return nil 8709 } 8710 8711 // Check on encryption. 8712 if o.aek != nil { 8713 buf = o.encryptState(buf) 8714 } 8715 8716 o.writing = true 8717 o.dirty = false 8718 ifn := o.ifn 8719 o.mu.Unlock() 8720 8721 // Lock not held here but we do limit number of outstanding calls that could block OS threads. 8722 <-dios 8723 err := os.WriteFile(ifn, buf, defaultFilePerms) 8724 dios <- struct{}{} 8725 8726 o.mu.Lock() 8727 if err != nil { 8728 o.dirty = true 8729 } 8730 o.writing = false 8731 o.mu.Unlock() 8732 8733 return err 8734 } 8735 8736 // Will upodate the config. Only used when recovering ephemerals. 8737 func (o *consumerFileStore) updateConfig(cfg ConsumerConfig) error { 8738 o.mu.Lock() 8739 defer o.mu.Unlock() 8740 o.cfg = &FileConsumerInfo{ConsumerConfig: cfg} 8741 return o.writeConsumerMeta() 8742 } 8743 8744 // Write out the consumer meta data, i.e. state. 8745 // Lock should be held. 8746 func (cfs *consumerFileStore) writeConsumerMeta() error { 8747 meta := filepath.Join(cfs.odir, JetStreamMetaFile) 8748 if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) { 8749 return err 8750 } 8751 8752 if cfs.prf != nil && cfs.aek == nil { 8753 fs := cfs.fs 8754 key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name + tsep + cfs.name) 8755 if err != nil { 8756 return err 8757 } 8758 cfs.aek = key 8759 keyFile := filepath.Join(cfs.odir, JetStreamMetaFileKey) 8760 if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) { 8761 return err 8762 } 8763 <-dios 8764 err = os.WriteFile(keyFile, encrypted, defaultFilePerms) 8765 dios <- struct{}{} 8766 if err != nil { 8767 return err 8768 } 8769 } 8770 8771 b, err := json.Marshal(cfs.cfg) 8772 if err != nil { 8773 return err 8774 } 8775 // Encrypt if needed. 8776 if cfs.aek != nil { 8777 nonce := make([]byte, cfs.aek.NonceSize(), cfs.aek.NonceSize()+len(b)+cfs.aek.Overhead()) 8778 rand.Read(nonce) 8779 b = cfs.aek.Seal(nonce, nonce, b, nil) 8780 } 8781 8782 <-dios 8783 err = os.WriteFile(meta, b, defaultFilePerms) 8784 dios <- struct{}{} 8785 if err != nil { 8786 return err 8787 } 8788 cfs.hh.Reset() 8789 cfs.hh.Write(b) 8790 checksum := hex.EncodeToString(cfs.hh.Sum(nil)) 8791 sum := filepath.Join(cfs.odir, JetStreamMetaFileSum) 8792 8793 <-dios 8794 err = os.WriteFile(sum, []byte(checksum), defaultFilePerms) 8795 dios <- struct{}{} 8796 if err != nil { 8797 return err 8798 } 8799 return nil 8800 } 8801 8802 // Consumer version. 8803 func checkConsumerHeader(hdr []byte) (uint8, error) { 8804 if hdr == nil || len(hdr) < 2 || hdr[0] != magic { 8805 return 0, errCorruptState 8806 } 8807 version := hdr[1] 8808 switch version { 8809 case 1, 2: 8810 return version, nil 8811 } 8812 return 0, fmt.Errorf("unsupported version: %d", version) 8813 } 8814 8815 func (o *consumerFileStore) copyPending() map[uint64]*Pending { 8816 pending := make(map[uint64]*Pending, len(o.state.Pending)) 8817 for seq, p := range o.state.Pending { 8818 pending[seq] = &Pending{p.Sequence, p.Timestamp} 8819 } 8820 return pending 8821 } 8822 8823 func (o *consumerFileStore) copyRedelivered() map[uint64]uint64 { 8824 redelivered := make(map[uint64]uint64, len(o.state.Redelivered)) 8825 for seq, dc := range o.state.Redelivered { 8826 redelivered[seq] = dc 8827 } 8828 return redelivered 8829 } 8830 8831 // Type returns the type of the underlying store. 8832 func (o *consumerFileStore) Type() StorageType { return FileStorage } 8833 8834 // State retrieves the state from the state file. 8835 // This is not expected to be called in high performance code, only on startup. 8836 func (o *consumerFileStore) State() (*ConsumerState, error) { 8837 return o.stateWithCopy(true) 8838 } 8839 8840 // This will not copy pending or redelivered, so should only be done under the 8841 // consumer owner's lock. 8842 func (o *consumerFileStore) BorrowState() (*ConsumerState, error) { 8843 return o.stateWithCopy(false) 8844 } 8845 8846 func (o *consumerFileStore) stateWithCopy(doCopy bool) (*ConsumerState, error) { 8847 o.mu.Lock() 8848 defer o.mu.Unlock() 8849 return o.stateWithCopyLocked(doCopy) 8850 } 8851 8852 // Lock should be held. 8853 func (o *consumerFileStore) stateWithCopyLocked(doCopy bool) (*ConsumerState, error) { 8854 if o.closed { 8855 return nil, ErrStoreClosed 8856 } 8857 8858 state := &ConsumerState{} 8859 8860 // See if we have a running state or if we need to read in from disk. 8861 if o.state.Delivered.Consumer != 0 || o.state.Delivered.Stream != 0 { 8862 state.Delivered = o.state.Delivered 8863 state.AckFloor = o.state.AckFloor 8864 if len(o.state.Pending) > 0 { 8865 if doCopy { 8866 state.Pending = o.copyPending() 8867 } else { 8868 state.Pending = o.state.Pending 8869 } 8870 } 8871 if len(o.state.Redelivered) > 0 { 8872 if doCopy { 8873 state.Redelivered = o.copyRedelivered() 8874 } else { 8875 state.Redelivered = o.state.Redelivered 8876 } 8877 } 8878 return state, nil 8879 } 8880 8881 // Read the state in here from disk.. 8882 <-dios 8883 buf, err := os.ReadFile(o.ifn) 8884 dios <- struct{}{} 8885 8886 if err != nil && !os.IsNotExist(err) { 8887 return nil, err 8888 } 8889 8890 if len(buf) == 0 { 8891 return state, nil 8892 } 8893 8894 // Check on encryption. 8895 if o.aek != nil { 8896 ns := o.aek.NonceSize() 8897 buf, err = o.aek.Open(nil, buf[:ns], buf[ns:], nil) 8898 if err != nil { 8899 return nil, err 8900 } 8901 } 8902 8903 state, err = decodeConsumerState(buf) 8904 if err != nil { 8905 return nil, err 8906 } 8907 8908 // Copy this state into our own. 8909 o.state.Delivered = state.Delivered 8910 o.state.AckFloor = state.AckFloor 8911 if len(state.Pending) > 0 { 8912 if doCopy { 8913 o.state.Pending = make(map[uint64]*Pending, len(state.Pending)) 8914 for seq, p := range state.Pending { 8915 o.state.Pending[seq] = &Pending{p.Sequence, p.Timestamp} 8916 } 8917 } else { 8918 o.state.Pending = state.Pending 8919 } 8920 } 8921 if len(state.Redelivered) > 0 { 8922 if doCopy { 8923 o.state.Redelivered = make(map[uint64]uint64, len(state.Redelivered)) 8924 for seq, dc := range state.Redelivered { 8925 o.state.Redelivered[seq] = dc 8926 } 8927 } else { 8928 o.state.Redelivered = state.Redelivered 8929 } 8930 } 8931 8932 return state, nil 8933 } 8934 8935 // Lock should be held. Called at startup. 8936 func (o *consumerFileStore) loadState() { 8937 if _, err := os.Stat(o.ifn); err == nil { 8938 // This will load our state in from disk. 8939 o.stateWithCopyLocked(false) 8940 } 8941 } 8942 8943 // Decode consumer state. 8944 func decodeConsumerState(buf []byte) (*ConsumerState, error) { 8945 version, err := checkConsumerHeader(buf) 8946 if err != nil { 8947 return nil, err 8948 } 8949 8950 bi := hdrLen 8951 // Helpers, will set i to -1 on error. 8952 readSeq := func() uint64 { 8953 if bi < 0 { 8954 return 0 8955 } 8956 seq, n := binary.Uvarint(buf[bi:]) 8957 if n <= 0 { 8958 bi = -1 8959 return 0 8960 } 8961 bi += n 8962 return seq 8963 } 8964 readTimeStamp := func() int64 { 8965 if bi < 0 { 8966 return 0 8967 } 8968 ts, n := binary.Varint(buf[bi:]) 8969 if n <= 0 { 8970 bi = -1 8971 return -1 8972 } 8973 bi += n 8974 return ts 8975 } 8976 // Just for clarity below. 8977 readLen := readSeq 8978 readCount := readSeq 8979 8980 state := &ConsumerState{} 8981 state.AckFloor.Consumer = readSeq() 8982 state.AckFloor.Stream = readSeq() 8983 state.Delivered.Consumer = readSeq() 8984 state.Delivered.Stream = readSeq() 8985 8986 if bi == -1 { 8987 return nil, errCorruptState 8988 } 8989 if version == 1 { 8990 // Adjust back. Version 1 also stored delivered as next to be delivered, 8991 // so adjust that back down here. 8992 if state.AckFloor.Consumer > 1 { 8993 state.Delivered.Consumer += state.AckFloor.Consumer - 1 8994 } 8995 if state.AckFloor.Stream > 1 { 8996 state.Delivered.Stream += state.AckFloor.Stream - 1 8997 } 8998 } 8999 9000 // Protect ourselves against rolling backwards. 9001 const hbit = 1 << 63 9002 if state.AckFloor.Stream&hbit != 0 || state.Delivered.Stream&hbit != 0 { 9003 return nil, errCorruptState 9004 } 9005 9006 // We have additional stuff. 9007 if numPending := readLen(); numPending > 0 { 9008 mints := readTimeStamp() 9009 state.Pending = make(map[uint64]*Pending, numPending) 9010 for i := 0; i < int(numPending); i++ { 9011 sseq := readSeq() 9012 var dseq uint64 9013 if version == 2 { 9014 dseq = readSeq() 9015 } 9016 ts := readTimeStamp() 9017 // Check the state machine for corruption, not the value which could be -1. 9018 if bi == -1 { 9019 return nil, errCorruptState 9020 } 9021 // Adjust seq back. 9022 sseq += state.AckFloor.Stream 9023 if sseq == 0 { 9024 return nil, errCorruptState 9025 } 9026 if version == 2 { 9027 dseq += state.AckFloor.Consumer 9028 } 9029 // Adjust the timestamp back. 9030 if version == 1 { 9031 ts = (ts + mints) * int64(time.Second) 9032 } else { 9033 ts = (mints - ts) * int64(time.Second) 9034 } 9035 // Store in pending. 9036 state.Pending[sseq] = &Pending{dseq, ts} 9037 } 9038 } 9039 9040 // We have redelivered entries here. 9041 if numRedelivered := readLen(); numRedelivered > 0 { 9042 state.Redelivered = make(map[uint64]uint64, numRedelivered) 9043 for i := 0; i < int(numRedelivered); i++ { 9044 if seq, n := readSeq(), readCount(); seq > 0 && n > 0 { 9045 // Adjust seq back. 9046 seq += state.AckFloor.Stream 9047 state.Redelivered[seq] = n 9048 } 9049 } 9050 } 9051 9052 return state, nil 9053 } 9054 9055 // Stop the processing of the consumers's state. 9056 func (o *consumerFileStore) Stop() error { 9057 o.mu.Lock() 9058 if o.closed { 9059 o.mu.Unlock() 9060 return nil 9061 } 9062 if o.qch != nil { 9063 close(o.qch) 9064 o.qch = nil 9065 } 9066 9067 var err error 9068 var buf []byte 9069 9070 if o.dirty { 9071 // Make sure to write this out.. 9072 if buf, err = o.encodeState(); err == nil && len(buf) > 0 { 9073 if o.aek != nil { 9074 buf = o.encryptState(buf) 9075 } 9076 } 9077 } 9078 9079 o.odir = _EMPTY_ 9080 o.closed = true 9081 ifn, fs := o.ifn, o.fs 9082 o.mu.Unlock() 9083 9084 fs.RemoveConsumer(o) 9085 9086 if len(buf) > 0 { 9087 o.waitOnFlusher() 9088 <-dios 9089 err = os.WriteFile(ifn, buf, defaultFilePerms) 9090 dios <- struct{}{} 9091 } 9092 return err 9093 } 9094 9095 func (o *consumerFileStore) waitOnFlusher() { 9096 if !o.inFlusher() { 9097 return 9098 } 9099 9100 timeout := time.Now().Add(100 * time.Millisecond) 9101 for time.Now().Before(timeout) { 9102 if !o.inFlusher() { 9103 return 9104 } 9105 time.Sleep(10 * time.Millisecond) 9106 } 9107 } 9108 9109 // Delete the consumer. 9110 func (o *consumerFileStore) Delete() error { 9111 return o.delete(false) 9112 } 9113 9114 func (o *consumerFileStore) StreamDelete() error { 9115 return o.delete(true) 9116 } 9117 9118 func (o *consumerFileStore) delete(streamDeleted bool) error { 9119 o.mu.Lock() 9120 if o.closed { 9121 o.mu.Unlock() 9122 return nil 9123 } 9124 if o.qch != nil { 9125 close(o.qch) 9126 o.qch = nil 9127 } 9128 9129 var err error 9130 odir := o.odir 9131 o.odir = _EMPTY_ 9132 o.closed = true 9133 fs := o.fs 9134 o.mu.Unlock() 9135 9136 // If our stream was not deleted this will remove the directories. 9137 if odir != _EMPTY_ && !streamDeleted { 9138 <-dios 9139 err = os.RemoveAll(odir) 9140 dios <- struct{}{} 9141 } 9142 9143 if !streamDeleted { 9144 fs.RemoveConsumer(o) 9145 } 9146 9147 return err 9148 } 9149 9150 func (fs *fileStore) AddConsumer(o ConsumerStore) error { 9151 fs.cmu.Lock() 9152 defer fs.cmu.Unlock() 9153 fs.cfs = append(fs.cfs, o) 9154 return nil 9155 } 9156 9157 func (fs *fileStore) RemoveConsumer(o ConsumerStore) error { 9158 fs.cmu.Lock() 9159 defer fs.cmu.Unlock() 9160 for i, cfs := range fs.cfs { 9161 if o == cfs { 9162 fs.cfs = append(fs.cfs[:i], fs.cfs[i+1:]...) 9163 break 9164 } 9165 } 9166 return nil 9167 } 9168 9169 //////////////////////////////////////////////////////////////////////////////// 9170 // Templates 9171 //////////////////////////////////////////////////////////////////////////////// 9172 9173 type templateFileStore struct { 9174 dir string 9175 hh hash.Hash64 9176 } 9177 9178 func newTemplateFileStore(storeDir string) *templateFileStore { 9179 tdir := filepath.Join(storeDir, tmplsDir) 9180 key := sha256.Sum256([]byte("templates")) 9181 hh, err := highwayhash.New64(key[:]) 9182 if err != nil { 9183 return nil 9184 } 9185 return &templateFileStore{dir: tdir, hh: hh} 9186 } 9187 9188 func (ts *templateFileStore) Store(t *streamTemplate) error { 9189 dir := filepath.Join(ts.dir, t.Name) 9190 if err := os.MkdirAll(dir, defaultDirPerms); err != nil { 9191 return fmt.Errorf("could not create templates storage directory for %q- %v", t.Name, err) 9192 } 9193 meta := filepath.Join(dir, JetStreamMetaFile) 9194 if _, err := os.Stat(meta); (err != nil && !os.IsNotExist(err)) || err == nil { 9195 return err 9196 } 9197 t.mu.Lock() 9198 b, err := json.Marshal(t) 9199 t.mu.Unlock() 9200 if err != nil { 9201 return err 9202 } 9203 if err := os.WriteFile(meta, b, defaultFilePerms); err != nil { 9204 return err 9205 } 9206 // FIXME(dlc) - Do checksum 9207 ts.hh.Reset() 9208 ts.hh.Write(b) 9209 checksum := hex.EncodeToString(ts.hh.Sum(nil)) 9210 sum := filepath.Join(dir, JetStreamMetaFileSum) 9211 if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil { 9212 return err 9213 } 9214 return nil 9215 } 9216 9217 func (ts *templateFileStore) Delete(t *streamTemplate) error { 9218 return os.RemoveAll(filepath.Join(ts.dir, t.Name)) 9219 } 9220 9221 //////////////////////////////////////////////////////////////////////////////// 9222 // Compression 9223 //////////////////////////////////////////////////////////////////////////////// 9224 9225 type CompressionInfo struct { 9226 Algorithm StoreCompression 9227 OriginalSize uint64 9228 } 9229 9230 func (c *CompressionInfo) MarshalMetadata() []byte { 9231 b := make([]byte, 14) // 4 + potentially up to 10 for uint64 9232 b[0], b[1], b[2] = 'c', 'm', 'p' 9233 b[3] = byte(c.Algorithm) 9234 n := binary.PutUvarint(b[4:], c.OriginalSize) 9235 return b[:4+n] 9236 } 9237 9238 func (c *CompressionInfo) UnmarshalMetadata(b []byte) (int, error) { 9239 c.Algorithm = NoCompression 9240 c.OriginalSize = 0 9241 if len(b) < 5 { // 4 + min 1 for uvarint uint64 9242 return 0, nil 9243 } 9244 if b[0] != 'c' || b[1] != 'm' || b[2] != 'p' { 9245 return 0, nil 9246 } 9247 var n int 9248 c.Algorithm = StoreCompression(b[3]) 9249 c.OriginalSize, n = binary.Uvarint(b[4:]) 9250 if n <= 0 { 9251 return 0, fmt.Errorf("metadata incomplete") 9252 } 9253 return 4 + n, nil 9254 } 9255 9256 func (alg StoreCompression) Compress(buf []byte) ([]byte, error) { 9257 if len(buf) < checksumSize { 9258 return nil, fmt.Errorf("uncompressed buffer is too short") 9259 } 9260 bodyLen := int64(len(buf) - checksumSize) 9261 var output bytes.Buffer 9262 var writer io.WriteCloser 9263 switch alg { 9264 case NoCompression: 9265 return buf, nil 9266 case S2Compression: 9267 writer = s2.NewWriter(&output) 9268 default: 9269 return nil, fmt.Errorf("compression algorithm not known") 9270 } 9271 9272 input := bytes.NewReader(buf[:bodyLen]) 9273 checksum := buf[bodyLen:] 9274 9275 // Compress the block content, but don't compress the checksum. 9276 // We will preserve it at the end of the block as-is. 9277 if n, err := io.CopyN(writer, input, bodyLen); err != nil { 9278 return nil, fmt.Errorf("error writing to compression writer: %w", err) 9279 } else if n != bodyLen { 9280 return nil, fmt.Errorf("short write on body (%d != %d)", n, bodyLen) 9281 } 9282 if err := writer.Close(); err != nil { 9283 return nil, fmt.Errorf("error closing compression writer: %w", err) 9284 } 9285 9286 // Now add the checksum back onto the end of the block. 9287 if n, err := output.Write(checksum); err != nil { 9288 return nil, fmt.Errorf("error writing checksum: %w", err) 9289 } else if n != checksumSize { 9290 return nil, fmt.Errorf("short write on checksum (%d != %d)", n, checksumSize) 9291 } 9292 9293 return output.Bytes(), nil 9294 } 9295 9296 func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) { 9297 if len(buf) < checksumSize { 9298 return nil, fmt.Errorf("compressed buffer is too short") 9299 } 9300 bodyLen := int64(len(buf) - checksumSize) 9301 input := bytes.NewReader(buf[:bodyLen]) 9302 9303 var reader io.ReadCloser 9304 switch alg { 9305 case NoCompression: 9306 return buf, nil 9307 case S2Compression: 9308 reader = io.NopCloser(s2.NewReader(input)) 9309 default: 9310 return nil, fmt.Errorf("compression algorithm not known") 9311 } 9312 9313 // Decompress the block content. The checksum isn't compressed so 9314 // we can preserve it from the end of the block as-is. 9315 checksum := buf[bodyLen:] 9316 output, err := io.ReadAll(reader) 9317 if err != nil { 9318 return nil, fmt.Errorf("error reading compression reader: %w", err) 9319 } 9320 output = append(output, checksum...) 9321 9322 return output, reader.Close() 9323 }