github.com/nats-io/nats-server/v2@v2.11.0-preview.2/server/filestore.go (about) 1 // Copyright 2019-2024 The NATS Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package server 15 16 import ( 17 "archive/tar" 18 "bytes" 19 "crypto/aes" 20 "crypto/cipher" 21 "crypto/rand" 22 "crypto/sha256" 23 "encoding/binary" 24 "encoding/hex" 25 "encoding/json" 26 "errors" 27 "fmt" 28 "hash" 29 "io" 30 "math" 31 "net" 32 "os" 33 "path/filepath" 34 "sort" 35 "strings" 36 "sync" 37 "sync/atomic" 38 "time" 39 40 "github.com/klauspost/compress/s2" 41 "github.com/minio/highwayhash" 42 "github.com/nats-io/nats-server/v2/server/avl" 43 "github.com/nats-io/nats-server/v2/server/stree" 44 "golang.org/x/crypto/chacha20" 45 "golang.org/x/crypto/chacha20poly1305" 46 ) 47 48 type FileStoreConfig struct { 49 // Where the parent directory for all storage will be located. 50 StoreDir string 51 // BlockSize is the file block size. This also represents the maximum overhead size. 52 BlockSize uint64 53 // CacheExpire is how long with no activity until we expire the cache. 54 CacheExpire time.Duration 55 // SyncInterval is how often we sync to disk in the background. 56 SyncInterval time.Duration 57 // SyncAlways is when the stream should sync all data writes. 58 SyncAlways bool 59 // AsyncFlush allows async flush to batch write operations. 60 AsyncFlush bool 61 // Cipher is the cipher to use when encrypting. 
62 Cipher StoreCipher 63 // Compression is the algorithm to use when compressing. 64 Compression StoreCompression 65 66 // Internal reference to our server. 67 srv *Server 68 } 69 70 // FileStreamInfo allows us to remember created time. 71 type FileStreamInfo struct { 72 Created time.Time 73 StreamConfig 74 } 75 76 type StoreCipher int 77 78 const ( 79 ChaCha StoreCipher = iota 80 AES 81 NoCipher 82 ) 83 84 func (cipher StoreCipher) String() string { 85 switch cipher { 86 case ChaCha: 87 return "ChaCha20-Poly1305" 88 case AES: 89 return "AES-GCM" 90 case NoCipher: 91 return "None" 92 default: 93 return "Unknown StoreCipher" 94 } 95 } 96 97 type StoreCompression uint8 98 99 const ( 100 NoCompression StoreCompression = iota 101 S2Compression 102 ) 103 104 func (alg StoreCompression) String() string { 105 switch alg { 106 case NoCompression: 107 return "None" 108 case S2Compression: 109 return "S2" 110 default: 111 return "Unknown StoreCompression" 112 } 113 } 114 115 func (alg StoreCompression) MarshalJSON() ([]byte, error) { 116 var str string 117 switch alg { 118 case S2Compression: 119 str = "s2" 120 case NoCompression: 121 str = "none" 122 default: 123 return nil, fmt.Errorf("unknown compression algorithm") 124 } 125 return json.Marshal(str) 126 } 127 128 func (alg *StoreCompression) UnmarshalJSON(b []byte) error { 129 var str string 130 if err := json.Unmarshal(b, &str); err != nil { 131 return err 132 } 133 switch str { 134 case "s2": 135 *alg = S2Compression 136 case "none": 137 *alg = NoCompression 138 default: 139 return fmt.Errorf("unknown compression algorithm") 140 } 141 return nil 142 } 143 144 // File ConsumerInfo is used for creating consumer stores. 145 type FileConsumerInfo struct { 146 Created time.Time 147 Name string 148 ConsumerConfig 149 } 150 151 // Default file and directory permissions. 
152 const ( 153 defaultDirPerms = os.FileMode(0750) 154 defaultFilePerms = os.FileMode(0640) 155 ) 156 157 type psi struct { 158 total uint64 159 fblk uint32 160 lblk uint32 161 } 162 163 type fileStore struct { 164 srv *Server 165 mu sync.RWMutex 166 state StreamState 167 tombs []uint64 168 ld *LostStreamData 169 scb StorageUpdateHandler 170 ageChk *time.Timer 171 syncTmr *time.Timer 172 cfg FileStreamInfo 173 fcfg FileStoreConfig 174 prf keyGen 175 oldprf keyGen 176 aek cipher.AEAD 177 lmb *msgBlock 178 blks []*msgBlock 179 bim map[uint32]*msgBlock 180 psim *stree.SubjectTree[psi] 181 tsl int 182 adml int 183 hh hash.Hash64 184 qch chan struct{} 185 fsld chan struct{} 186 cmu sync.RWMutex 187 cfs []ConsumerStore 188 sips int 189 dirty int 190 closing bool 191 closed bool 192 fip bool 193 receivedAny bool 194 } 195 196 // Represents a message store block and its data. 197 type msgBlock struct { 198 // Here for 32bit systems and atomic. 199 first msgId 200 last msgId 201 mu sync.RWMutex 202 fs *fileStore 203 aek cipher.AEAD 204 bek cipher.Stream 205 seed []byte 206 nonce []byte 207 mfn string 208 mfd *os.File 209 cmp StoreCompression // Effective compression at the time of loading the block 210 liwsz int64 211 index uint32 212 bytes uint64 // User visible bytes count. 213 rbytes uint64 // Total bytes (raw) including deleted. Used for rolling to new blk. 214 msgs uint64 // User visible message count. 215 fss map[string]*SimpleState 216 kfn string 217 lwts int64 218 llts int64 219 lrts int64 220 llseq uint64 221 hh hash.Hash64 222 cache *cache 223 cloads uint64 224 cexp time.Duration 225 ctmr *time.Timer 226 werr error 227 dmap avl.SequenceSet 228 fch chan struct{} 229 qch chan struct{} 230 lchk [8]byte 231 loading bool 232 flusher bool 233 noTrack bool 234 needSync bool 235 syncAlways bool 236 closed bool 237 238 // Used to mock write failures. 239 mockWriteErr bool 240 } 241 242 // Write through caching layer that is also used on loading messages. 
243 type cache struct { 244 buf []byte 245 off int 246 wp int 247 idx []uint32 248 lrl uint32 249 fseq uint64 250 nra bool 251 } 252 253 type msgId struct { 254 seq uint64 255 ts int64 256 } 257 258 const ( 259 // Magic is used to identify the file store files. 260 magic = uint8(22) 261 // Version 262 version = uint8(1) 263 // New IndexInfo Version 264 newVersion = uint8(2) 265 // hdrLen 266 hdrLen = 2 267 // This is where we keep the streams. 268 streamsDir = "streams" 269 // This is where we keep the message store blocks. 270 msgDir = "msgs" 271 // This is where we temporarily move the messages dir. 272 purgeDir = "__msgs__" 273 // used to scan blk file names. 274 blkScan = "%d.blk" 275 // used for compacted blocks that are staged. 276 newScan = "%d.new" 277 // used to scan index file names. 278 indexScan = "%d.idx" 279 // used to store our block encryption key. 280 keyScan = "%d.key" 281 // to look for orphans 282 keyScanAll = "*.key" 283 // This is where we keep state on consumers. 284 consumerDir = "obs" 285 // Index file for a consumer. 286 consumerState = "o.dat" 287 // The suffix that will be given to a new temporary block during compression. 288 compressTmpSuffix = ".tmp" 289 // This is where we keep state on templates. 290 tmplsDir = "templates" 291 // Maximum size of a write buffer we may consider for re-use. 292 maxBufReuse = 2 * 1024 * 1024 293 // default cache buffer expiration 294 defaultCacheBufferExpiration = 2 * time.Second 295 // default sync interval 296 defaultSyncInterval = 2 * time.Minute 297 // default idle timeout to close FDs. 298 closeFDsIdle = 30 * time.Second 299 // coalesceMinimum 300 coalesceMinimum = 16 * 1024 301 // maxFlushWait is maximum we will wait to gather messages to flush. 302 maxFlushWait = 8 * time.Millisecond 303 304 // Metafiles for streams and consumers. 
305 JetStreamMetaFile = "meta.inf" 306 JetStreamMetaFileSum = "meta.sum" 307 JetStreamMetaFileKey = "meta.key" 308 309 // This is the full snapshotted state for the stream. 310 streamStreamStateFile = "index.db" 311 312 // AEK key sizes 313 minMetaKeySize = 64 314 minBlkKeySize = 64 315 316 // Default stream block size. 317 defaultLargeBlockSize = 8 * 1024 * 1024 // 8MB 318 // Default for workqueue or interest based. 319 defaultMediumBlockSize = 4 * 1024 * 1024 // 4MB 320 // For smaller reuse buffers. Usually being generated during contention on the lead write buffer. 321 // E.g. mirrors/sources etc. 322 defaultSmallBlockSize = 1 * 1024 * 1024 // 1MB 323 // Maximum size for the encrypted head block. 324 maximumEncryptedBlockSize = 2 * 1024 * 1024 // 2MB 325 // Default for KV based 326 defaultKVBlockSize = defaultMediumBlockSize 327 // max block size for now. 328 maxBlockSize = defaultLargeBlockSize 329 // Compact minimum threshold. 330 compactMinimum = 2 * 1024 * 1024 // 2MB 331 // FileStoreMinBlkSize is minimum size we will do for a blk size. 332 FileStoreMinBlkSize = 32 * 1000 // 32kib 333 // FileStoreMaxBlkSize is maximum size we will do for a blk size. 334 FileStoreMaxBlkSize = maxBlockSize 335 // Check for bad record length value due to corrupt data. 336 rlBadThresh = 32 * 1024 * 1024 337 // Checksum size for hash for msg records. 338 recordHashSize = 8 339 ) 340 341 func newFileStore(fcfg FileStoreConfig, cfg StreamConfig) (*fileStore, error) { 342 return newFileStoreWithCreated(fcfg, cfg, time.Now().UTC(), nil, nil) 343 } 344 345 func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created time.Time, prf, oldprf keyGen) (*fileStore, error) { 346 if cfg.Name == _EMPTY_ { 347 return nil, fmt.Errorf("name required") 348 } 349 if cfg.Storage != FileStorage { 350 return nil, fmt.Errorf("fileStore requires file storage type in config") 351 } 352 // Default values. 
353 if fcfg.BlockSize == 0 { 354 fcfg.BlockSize = dynBlkSize(cfg.Retention, cfg.MaxBytes, prf != nil) 355 } 356 if fcfg.BlockSize > maxBlockSize { 357 return nil, fmt.Errorf("filestore max block size is %s", friendlyBytes(maxBlockSize)) 358 } 359 if fcfg.CacheExpire == 0 { 360 fcfg.CacheExpire = defaultCacheBufferExpiration 361 } 362 if fcfg.SyncInterval == 0 { 363 fcfg.SyncInterval = defaultSyncInterval 364 } 365 366 // Check the directory 367 if stat, err := os.Stat(fcfg.StoreDir); os.IsNotExist(err) { 368 if err := os.MkdirAll(fcfg.StoreDir, defaultDirPerms); err != nil { 369 return nil, fmt.Errorf("could not create storage directory - %v", err) 370 } 371 } else if stat == nil || !stat.IsDir() { 372 return nil, fmt.Errorf("storage directory is not a directory") 373 } 374 tmpfile, err := os.CreateTemp(fcfg.StoreDir, "_test_") 375 if err != nil { 376 return nil, fmt.Errorf("storage directory is not writable") 377 } 378 379 tmpfile.Close() 380 <-dios 381 os.Remove(tmpfile.Name()) 382 dios <- struct{}{} 383 384 fs := &fileStore{ 385 fcfg: fcfg, 386 psim: stree.NewSubjectTree[psi](), 387 bim: make(map[uint32]*msgBlock), 388 cfg: FileStreamInfo{Created: created, StreamConfig: cfg}, 389 prf: prf, 390 oldprf: oldprf, 391 qch: make(chan struct{}), 392 fsld: make(chan struct{}), 393 srv: fcfg.srv, 394 } 395 396 // Set flush in place to AsyncFlush which by default is false. 397 fs.fip = !fcfg.AsyncFlush 398 399 // Check if this is a new setup. 400 mdir := filepath.Join(fcfg.StoreDir, msgDir) 401 odir := filepath.Join(fcfg.StoreDir, consumerDir) 402 if err := os.MkdirAll(mdir, defaultDirPerms); err != nil { 403 return nil, fmt.Errorf("could not create message storage directory - %v", err) 404 } 405 if err := os.MkdirAll(odir, defaultDirPerms); err != nil { 406 return nil, fmt.Errorf("could not create consumer storage directory - %v", err) 407 } 408 409 // Create highway hash for message blocks. Use sha256 of directory as key. 
410 key := sha256.Sum256([]byte(cfg.Name)) 411 fs.hh, err = highwayhash.New64(key[:]) 412 if err != nil { 413 return nil, fmt.Errorf("could not create hash: %v", err) 414 } 415 416 keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey) 417 // Make sure we do not have an encrypted store underneath of us but no main key. 418 if fs.prf == nil { 419 if _, err := os.Stat(keyFile); err == nil { 420 return nil, errNoMainKey 421 } 422 } 423 424 // Attempt to recover our state. 425 err = fs.recoverFullState() 426 if err != nil { 427 // Hold onto state 428 prior := fs.state 429 // Reset anything that could have been set from above. 430 fs.state = StreamState{} 431 fs.psim, fs.tsl = fs.psim.Empty(), 0 432 fs.bim = make(map[uint32]*msgBlock) 433 fs.blks = nil 434 fs.tombs = nil 435 436 // Recover our message state the old way 437 if err := fs.recoverMsgs(); err != nil { 438 return nil, err 439 } 440 441 // Check if our prior state remembers a last sequence past where we can see. 442 if fs.ld != nil && prior.LastSeq > fs.state.LastSeq { 443 fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime 444 if lmb, err := fs.newMsgBlockForWrite(); err == nil { 445 lmb.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano()) 446 } else { 447 return nil, err 448 } 449 } 450 // Since we recovered here, make sure to kick ourselves to write out our stream state. 451 fs.dirty++ 452 } 453 454 // Also make sure we get rid of old idx and fss files on return. 455 // Do this in separate go routine vs inline and at end of processing. 456 defer func() { 457 go fs.cleanupOldMeta() 458 }() 459 460 // Lock while do enforcements and removals. 461 fs.mu.Lock() 462 463 // Check if we have any left over tombstones to process. 464 if len(fs.tombs) > 0 { 465 for _, seq := range fs.tombs { 466 fs.removeMsg(seq, false, true, false) 467 fs.removeFromLostData(seq) 468 } 469 // Not needed after this phase. 470 fs.tombs = nil 471 } 472 473 // Limits checks and enforcement. 
474 fs.enforceMsgLimit() 475 fs.enforceBytesLimit() 476 477 // Do age checks too, make sure to call in place. 478 if fs.cfg.MaxAge != 0 { 479 fs.expireMsgsOnRecover() 480 fs.startAgeChk() 481 } 482 483 // If we have max msgs per subject make sure the is also enforced. 484 if fs.cfg.MaxMsgsPer > 0 { 485 fs.enforceMsgPerSubjectLimit(false) 486 } 487 488 // Grab first sequence for check below while we have lock. 489 firstSeq := fs.state.FirstSeq 490 fs.mu.Unlock() 491 492 // If the stream has an initial sequence number then make sure we 493 // have purged up until that point. We will do this only if the 494 // recovered first sequence number is before our configured first 495 // sequence. Need to do this locked as by now the age check timer 496 // has started. 497 if cfg.FirstSeq > 0 && firstSeq <= cfg.FirstSeq { 498 if _, err := fs.purge(cfg.FirstSeq); err != nil { 499 return nil, err 500 } 501 } 502 503 // Write our meta data if it does not exist or is zero'd out. 504 meta := filepath.Join(fcfg.StoreDir, JetStreamMetaFile) 505 fi, err := os.Stat(meta) 506 if err != nil && os.IsNotExist(err) || fi != nil && fi.Size() == 0 { 507 if err := fs.writeStreamMeta(); err != nil { 508 return nil, err 509 } 510 } 511 512 // If we expect to be encrypted check that what we are restoring is not plaintext. 513 // This can happen on snapshot restores or conversions. 514 if fs.prf != nil { 515 if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) { 516 if err := fs.writeStreamMeta(); err != nil { 517 return nil, err 518 } 519 } 520 } 521 522 // Setup our sync timer. 523 fs.setSyncTimer() 524 525 // Spin up the go routine that will write out our full state stream index. 526 go fs.flushStreamStateLoop(fs.qch, fs.fsld) 527 528 return fs, nil 529 } 530 531 // Lock all existing message blocks. 532 // Lock held on entry. 533 func (fs *fileStore) lockAllMsgBlocks() { 534 for _, mb := range fs.blks { 535 mb.mu.Lock() 536 } 537 } 538 539 // Unlock all existing message blocks. 
540 // Lock held on entry. 541 func (fs *fileStore) unlockAllMsgBlocks() { 542 for _, mb := range fs.blks { 543 mb.mu.Unlock() 544 } 545 } 546 547 func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error { 548 if fs.isClosed() { 549 return ErrStoreClosed 550 } 551 if cfg.Name == _EMPTY_ { 552 return fmt.Errorf("name required") 553 } 554 if cfg.Storage != FileStorage { 555 return fmt.Errorf("fileStore requires file storage type in config") 556 } 557 558 fs.mu.Lock() 559 new_cfg := FileStreamInfo{Created: fs.cfg.Created, StreamConfig: *cfg} 560 old_cfg := fs.cfg 561 // The reference story has changed here, so this full msg block lock 562 // may not be needed. 563 fs.lockAllMsgBlocks() 564 fs.cfg = new_cfg 565 fs.unlockAllMsgBlocks() 566 if err := fs.writeStreamMeta(); err != nil { 567 fs.lockAllMsgBlocks() 568 fs.cfg = old_cfg 569 fs.unlockAllMsgBlocks() 570 fs.mu.Unlock() 571 return err 572 } 573 574 // Limits checks and enforcement. 575 fs.enforceMsgLimit() 576 fs.enforceBytesLimit() 577 578 // Do age timers. 579 if fs.ageChk == nil && fs.cfg.MaxAge != 0 { 580 fs.startAgeChk() 581 } 582 if fs.ageChk != nil && fs.cfg.MaxAge == 0 { 583 fs.ageChk.Stop() 584 fs.ageChk = nil 585 } 586 587 if fs.cfg.MaxMsgsPer > 0 && fs.cfg.MaxMsgsPer < old_cfg.MaxMsgsPer { 588 fs.enforceMsgPerSubjectLimit(true) 589 } 590 fs.mu.Unlock() 591 592 if cfg.MaxAge != 0 { 593 fs.expireMsgs() 594 } 595 return nil 596 } 597 598 func dynBlkSize(retention RetentionPolicy, maxBytes int64, encrypted bool) uint64 { 599 if maxBytes > 0 { 600 blkSize := (maxBytes / 4) + 1 // (25% overhead) 601 // Round up to nearest 100 602 if m := blkSize % 100; m != 0 { 603 blkSize += 100 - m 604 } 605 if blkSize <= FileStoreMinBlkSize { 606 blkSize = FileStoreMinBlkSize 607 } else if blkSize >= FileStoreMaxBlkSize { 608 blkSize = FileStoreMaxBlkSize 609 } else { 610 blkSize = defaultMediumBlockSize 611 } 612 if encrypted && blkSize > maximumEncryptedBlockSize { 613 // Notes on this below. 
614 blkSize = maximumEncryptedBlockSize 615 } 616 return uint64(blkSize) 617 } 618 619 switch { 620 case encrypted: 621 // In the case of encrypted stores, large blocks can result in worsened perf 622 // since many writes on disk involve re-encrypting the entire block. For now, 623 // we will enforce a cap on the block size when encryption is enabled to avoid 624 // this. 625 return maximumEncryptedBlockSize 626 case retention == LimitsPolicy: 627 // TODO(dlc) - Make the blocksize relative to this if set. 628 return defaultLargeBlockSize 629 default: 630 // TODO(dlc) - Make the blocksize relative to this if set. 631 return defaultMediumBlockSize 632 } 633 } 634 635 func genEncryptionKey(sc StoreCipher, seed []byte) (ek cipher.AEAD, err error) { 636 if sc == ChaCha { 637 ek, err = chacha20poly1305.NewX(seed) 638 } else if sc == AES { 639 block, e := aes.NewCipher(seed) 640 if e != nil { 641 return nil, e 642 } 643 ek, err = cipher.NewGCMWithNonceSize(block, block.BlockSize()) 644 } else { 645 err = errUnknownCipher 646 } 647 return ek, err 648 } 649 650 // Generate an asset encryption key from the context and server PRF. 651 func (fs *fileStore) genEncryptionKeys(context string) (aek cipher.AEAD, bek cipher.Stream, seed, encrypted []byte, err error) { 652 if fs.prf == nil { 653 return nil, nil, nil, nil, errNoEncryption 654 } 655 // Generate key encryption key. 656 rb, err := fs.prf([]byte(context)) 657 if err != nil { 658 return nil, nil, nil, nil, err 659 } 660 661 sc := fs.fcfg.Cipher 662 663 kek, err := genEncryptionKey(sc, rb) 664 if err != nil { 665 return nil, nil, nil, nil, err 666 } 667 // Generate random asset encryption key seed. 
668 669 const seedSize = 32 670 seed = make([]byte, seedSize) 671 if n, err := rand.Read(seed); err != nil { 672 return nil, nil, nil, nil, err 673 } else if n != seedSize { 674 return nil, nil, nil, nil, fmt.Errorf("not enough seed bytes read (%d != %d", n, seedSize) 675 } 676 677 aek, err = genEncryptionKey(sc, seed) 678 if err != nil { 679 return nil, nil, nil, nil, err 680 } 681 682 // Generate our nonce. Use same buffer to hold encrypted seed. 683 nonce := make([]byte, kek.NonceSize(), kek.NonceSize()+len(seed)+kek.Overhead()) 684 if n, err := rand.Read(nonce); err != nil { 685 return nil, nil, nil, nil, err 686 } else if n != len(nonce) { 687 return nil, nil, nil, nil, fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce)) 688 } 689 690 bek, err = genBlockEncryptionKey(sc, seed[:], nonce) 691 if err != nil { 692 return nil, nil, nil, nil, err 693 } 694 695 return aek, bek, seed, kek.Seal(nonce, nonce, seed, nil), nil 696 } 697 698 // Will generate the block encryption key. 699 func genBlockEncryptionKey(sc StoreCipher, seed, nonce []byte) (cipher.Stream, error) { 700 if sc == ChaCha { 701 return chacha20.NewUnauthenticatedCipher(seed, nonce) 702 } else if sc == AES { 703 block, err := aes.NewCipher(seed) 704 if err != nil { 705 return nil, err 706 } 707 return cipher.NewCTR(block, nonce), nil 708 } 709 return nil, errUnknownCipher 710 } 711 712 // Lock should be held. 
713 func (fs *fileStore) recoverAEK() error { 714 if fs.prf != nil && fs.aek == nil { 715 ekey, err := os.ReadFile(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey)) 716 if err != nil { 717 return err 718 } 719 rb, err := fs.prf([]byte(fs.cfg.Name)) 720 if err != nil { 721 return err 722 } 723 kek, err := genEncryptionKey(fs.fcfg.Cipher, rb) 724 if err != nil { 725 return err 726 } 727 ns := kek.NonceSize() 728 seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) 729 if err != nil { 730 return err 731 } 732 aek, err := genEncryptionKey(fs.fcfg.Cipher, seed) 733 if err != nil { 734 return err 735 } 736 fs.aek = aek 737 } 738 return nil 739 } 740 741 // Lock should be held. 742 func (fs *fileStore) setupAEK() error { 743 if fs.prf != nil && fs.aek == nil { 744 key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name) 745 if err != nil { 746 return err 747 } 748 keyFile := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileKey) 749 if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) { 750 return err 751 } 752 <-dios 753 err = os.WriteFile(keyFile, encrypted, defaultFilePerms) 754 dios <- struct{}{} 755 if err != nil { 756 return err 757 } 758 // Set our aek. 759 fs.aek = key 760 } 761 return nil 762 } 763 764 // Write out meta and the checksum. 765 // Lock should be held. 766 func (fs *fileStore) writeStreamMeta() error { 767 if err := fs.setupAEK(); err != nil { 768 return err 769 } 770 771 meta := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile) 772 if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) { 773 return err 774 } 775 b, err := json.Marshal(fs.cfg) 776 if err != nil { 777 return err 778 } 779 // Encrypt if needed. 
780 if fs.aek != nil { 781 nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(b)+fs.aek.Overhead()) 782 if n, err := rand.Read(nonce); err != nil { 783 return err 784 } else if n != len(nonce) { 785 return fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce)) 786 } 787 b = fs.aek.Seal(nonce, nonce, b, nil) 788 } 789 790 <-dios 791 err = os.WriteFile(meta, b, defaultFilePerms) 792 dios <- struct{}{} 793 if err != nil { 794 return err 795 } 796 fs.hh.Reset() 797 fs.hh.Write(b) 798 checksum := hex.EncodeToString(fs.hh.Sum(nil)) 799 sum := filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFileSum) 800 <-dios 801 err = os.WriteFile(sum, []byte(checksum), defaultFilePerms) 802 dios <- struct{}{} 803 if err != nil { 804 return err 805 } 806 return nil 807 } 808 809 // Pools to recycle the blocks to help with memory pressure. 810 var blkPoolBig sync.Pool // 16MB 811 var blkPoolMedium sync.Pool // 8MB 812 var blkPoolSmall sync.Pool // 2MB 813 814 // Get a new msg block based on sz estimate. 815 func getMsgBlockBuf(sz int) (buf []byte) { 816 var pb any 817 if sz <= defaultSmallBlockSize { 818 pb = blkPoolSmall.Get() 819 } else if sz <= defaultMediumBlockSize { 820 pb = blkPoolMedium.Get() 821 } else { 822 pb = blkPoolBig.Get() 823 } 824 if pb != nil { 825 buf = *(pb.(*[]byte)) 826 } else { 827 // Here we need to make a new blk. 828 // If small leave as is.. 829 if sz > defaultSmallBlockSize && sz <= defaultMediumBlockSize { 830 sz = defaultMediumBlockSize 831 } else if sz > defaultMediumBlockSize { 832 sz = defaultLargeBlockSize 833 } 834 buf = make([]byte, sz) 835 } 836 return buf[:0] 837 } 838 839 // Recycle the msg block. 840 func recycleMsgBlockBuf(buf []byte) { 841 if buf == nil || cap(buf) < defaultSmallBlockSize { 842 return 843 } 844 // Make sure to reset before placing back into pool. 845 buf = buf[:0] 846 847 // We need to make sure the load code gets a block that can fit the maximum for a size block. 848 // E.g. 8, 16 etc. 
otherwise we thrash and actually make things worse by pulling it out, and putting 849 // it right back in and making a new []byte. 850 // From above we know its already >= defaultSmallBlockSize 851 if sz := cap(buf); sz < defaultMediumBlockSize { 852 blkPoolSmall.Put(&buf) 853 } else if sz < defaultLargeBlockSize { 854 blkPoolMedium.Put(&buf) 855 } else { 856 blkPoolBig.Put(&buf) 857 } 858 } 859 860 const ( 861 msgHdrSize = 22 862 checksumSize = 8 863 emptyRecordLen = msgHdrSize + checksumSize 864 ) 865 866 // Lock should be held. 867 func (fs *fileStore) noTrackSubjects() bool { 868 return !(fs.psim.Size() > 0 || len(fs.cfg.Subjects) > 0 || fs.cfg.Mirror != nil || len(fs.cfg.Sources) > 0) 869 } 870 871 // Will init the basics for a message block. 872 func (fs *fileStore) initMsgBlock(index uint32) *msgBlock { 873 mb := &msgBlock{fs: fs, index: index, cexp: fs.fcfg.CacheExpire, noTrack: fs.noTrackSubjects(), syncAlways: fs.fcfg.SyncAlways} 874 875 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 876 mb.mfn = filepath.Join(mdir, fmt.Sprintf(blkScan, index)) 877 878 if mb.hh == nil { 879 key := sha256.Sum256(fs.hashKeyForBlock(index)) 880 mb.hh, _ = highwayhash.New64(key[:]) 881 } 882 return mb 883 } 884 885 // Lock for fs should be held. 886 func (fs *fileStore) loadEncryptionForMsgBlock(mb *msgBlock) error { 887 if fs.prf == nil { 888 return nil 889 } 890 891 var createdKeys bool 892 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 893 ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))) 894 if err != nil { 895 // We do not seem to have keys even though we should. Could be a plaintext conversion. 896 // Create the keys and we will double check below. 897 if err := fs.genEncryptionKeysForBlock(mb); err != nil { 898 return err 899 } 900 createdKeys = true 901 } else { 902 if len(ekey) < minBlkKeySize { 903 return errBadKeySize 904 } 905 // Recover key encryption key. 
906 rb, err := fs.prf([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))) 907 if err != nil { 908 return err 909 } 910 911 sc := fs.fcfg.Cipher 912 kek, err := genEncryptionKey(sc, rb) 913 if err != nil { 914 return err 915 } 916 ns := kek.NonceSize() 917 seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) 918 if err != nil { 919 // We may be here on a cipher conversion, so attempt to convert. 920 if err = mb.convertCipher(); err != nil { 921 return err 922 } 923 } else { 924 mb.seed, mb.nonce = seed, ekey[:ns] 925 } 926 mb.aek, err = genEncryptionKey(sc, mb.seed) 927 if err != nil { 928 return err 929 } 930 if mb.bek, err = genBlockEncryptionKey(sc, mb.seed, mb.nonce); err != nil { 931 return err 932 } 933 } 934 935 // If we created keys here, let's check the data and if it is plaintext convert here. 936 if createdKeys { 937 if err := mb.convertToEncrypted(); err != nil { 938 return err 939 } 940 } 941 942 return nil 943 } 944 945 // Load a last checksum if needed from the block file. 946 // Lock should be held. 947 func (mb *msgBlock) ensureLastChecksumLoaded() { 948 var empty [8]byte 949 if mb.lchk != empty { 950 return 951 } 952 copy(mb.lchk[0:], mb.lastChecksum()) 953 } 954 955 // Lock held on entry 956 func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) { 957 mb := fs.initMsgBlock(index) 958 959 // Open up the message file, but we will try to recover from the index file. 960 // We will check that the last checksums match. 961 file, err := mb.openBlock() 962 if err != nil { 963 return nil, err 964 } 965 defer file.Close() 966 967 if fi, err := file.Stat(); fi != nil { 968 mb.rbytes = uint64(fi.Size()) 969 } else { 970 return nil, err 971 } 972 973 // Make sure encryption loaded if needed. 974 fs.loadEncryptionForMsgBlock(mb) 975 976 // Grab last checksum from main block file. 
977 var lchk [8]byte 978 if mb.rbytes >= checksumSize { 979 if mb.bek != nil { 980 if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize { 981 mb.bek.XORKeyStream(buf, buf) 982 copy(lchk[0:], buf[len(buf)-checksumSize:]) 983 } 984 } else { 985 file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize) 986 } 987 } 988 989 file.Close() 990 991 // Read our index file. Use this as source of truth if possible. 992 // This not applicable in >= 2.10 servers. Here for upgrade paths from < 2.10. 993 if err := mb.readIndexInfo(); err == nil { 994 // Quick sanity check here. 995 // Note this only checks that the message blk file is not newer then this file, or is empty and we expect empty. 996 if (mb.rbytes == 0 && mb.msgs == 0) || bytes.Equal(lchk[:], mb.lchk[:]) { 997 if mb.msgs > 0 && !mb.noTrack && fs.psim != nil { 998 fs.populateGlobalPerSubjectInfo(mb) 999 // Try to dump any state we needed on recovery. 1000 mb.tryForceExpireCacheLocked() 1001 } 1002 fs.addMsgBlock(mb) 1003 return mb, nil 1004 } 1005 } 1006 1007 // If we get data loss rebuilding the message block state record that with the fs itself. 1008 ld, tombs, _ := mb.rebuildState() 1009 if ld != nil { 1010 fs.addLostData(ld) 1011 } 1012 // Collect all tombstones. 1013 if len(tombs) > 0 { 1014 fs.tombs = append(fs.tombs, tombs...) 1015 } 1016 1017 if mb.msgs > 0 && !mb.noTrack && fs.psim != nil { 1018 fs.populateGlobalPerSubjectInfo(mb) 1019 // Try to dump any state we needed on recovery. 1020 mb.tryForceExpireCacheLocked() 1021 } 1022 1023 mb.closeFDs() 1024 fs.addMsgBlock(mb) 1025 1026 return mb, nil 1027 } 1028 1029 func (fs *fileStore) lostData() *LostStreamData { 1030 fs.mu.RLock() 1031 defer fs.mu.RUnlock() 1032 if fs.ld == nil { 1033 return nil 1034 } 1035 nld := *fs.ld 1036 return &nld 1037 } 1038 1039 // Lock should be held. 
1040 func (fs *fileStore) addLostData(ld *LostStreamData) { 1041 if ld == nil { 1042 return 1043 } 1044 if fs.ld != nil { 1045 var added bool 1046 for _, seq := range ld.Msgs { 1047 if _, found := fs.ld.exists(seq); !found { 1048 fs.ld.Msgs = append(fs.ld.Msgs, seq) 1049 added = true 1050 } 1051 } 1052 if added { 1053 msgs := fs.ld.Msgs 1054 sort.Slice(msgs, func(i, j int) bool { return msgs[i] < msgs[j] }) 1055 fs.ld.Bytes += ld.Bytes 1056 } 1057 } else { 1058 fs.ld = ld 1059 } 1060 } 1061 1062 // Helper to see if we already have this sequence reported in our lost data. 1063 func (ld *LostStreamData) exists(seq uint64) (int, bool) { 1064 i, found := sort.Find(len(ld.Msgs), func(i int) int { 1065 tseq := ld.Msgs[i] 1066 if tseq < seq { 1067 return -1 1068 } 1069 if tseq > seq { 1070 return +1 1071 } 1072 return 0 1073 }) 1074 return i, found 1075 } 1076 1077 func (fs *fileStore) removeFromLostData(seq uint64) { 1078 if fs.ld == nil { 1079 return 1080 } 1081 if i, found := fs.ld.exists(seq); found { 1082 fs.ld.Msgs = append(fs.ld.Msgs[:i], fs.ld.Msgs[i+1:]...) 1083 if len(fs.ld.Msgs) == 0 { 1084 fs.ld = nil 1085 } 1086 } 1087 } 1088 1089 func (fs *fileStore) rebuildState(ld *LostStreamData) { 1090 fs.mu.Lock() 1091 defer fs.mu.Unlock() 1092 fs.rebuildStateLocked(ld) 1093 } 1094 1095 // Lock should be held. 
1096 func (fs *fileStore) rebuildStateLocked(ld *LostStreamData) { 1097 fs.addLostData(ld) 1098 1099 fs.state.Msgs, fs.state.Bytes = 0, 0 1100 fs.state.FirstSeq, fs.state.LastSeq = 0, 0 1101 1102 for _, mb := range fs.blks { 1103 mb.mu.RLock() 1104 fs.state.Msgs += mb.msgs 1105 fs.state.Bytes += mb.bytes 1106 fseq := atomic.LoadUint64(&mb.first.seq) 1107 if fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { 1108 fs.state.FirstSeq = fseq 1109 fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() 1110 } 1111 fs.state.LastSeq = atomic.LoadUint64(&mb.last.seq) 1112 fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() 1113 mb.mu.RUnlock() 1114 } 1115 } 1116 1117 // Attempt to convert the cipher used for this message block. 1118 func (mb *msgBlock) convertCipher() error { 1119 fs := mb.fs 1120 sc := fs.fcfg.Cipher 1121 1122 var osc StoreCipher 1123 switch sc { 1124 case ChaCha: 1125 osc = AES 1126 case AES: 1127 osc = ChaCha 1128 } 1129 1130 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 1131 ekey, err := os.ReadFile(filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index))) 1132 if err != nil { 1133 return err 1134 } 1135 if len(ekey) < minBlkKeySize { 1136 return errBadKeySize 1137 } 1138 type prfWithCipher struct { 1139 keyGen 1140 StoreCipher 1141 } 1142 var prfs []prfWithCipher 1143 if fs.prf != nil { 1144 prfs = append(prfs, prfWithCipher{fs.prf, sc}) 1145 prfs = append(prfs, prfWithCipher{fs.prf, osc}) 1146 } 1147 if fs.oldprf != nil { 1148 prfs = append(prfs, prfWithCipher{fs.oldprf, sc}) 1149 prfs = append(prfs, prfWithCipher{fs.oldprf, osc}) 1150 } 1151 1152 for _, prf := range prfs { 1153 // Recover key encryption key. 
1154 rb, err := prf.keyGen([]byte(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index))) 1155 if err != nil { 1156 continue 1157 } 1158 kek, err := genEncryptionKey(prf.StoreCipher, rb) 1159 if err != nil { 1160 continue 1161 } 1162 ns := kek.NonceSize() 1163 seed, err := kek.Open(nil, ekey[:ns], ekey[ns:], nil) 1164 if err != nil { 1165 continue 1166 } 1167 nonce := ekey[:ns] 1168 bek, err := genBlockEncryptionKey(prf.StoreCipher, seed, nonce) 1169 if err != nil { 1170 return err 1171 } 1172 1173 buf, _ := mb.loadBlock(nil) 1174 bek.XORKeyStream(buf, buf) 1175 // Make sure we can parse with old cipher and key file. 1176 if err = mb.indexCacheBuf(buf); err != nil { 1177 return err 1178 } 1179 // Reset the cache since we just read everything in. 1180 mb.cache = nil 1181 1182 // Generate new keys. If we error for some reason then we will put 1183 // the old keyfile back. 1184 if err := fs.genEncryptionKeysForBlock(mb); err != nil { 1185 keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)) 1186 <-dios 1187 os.WriteFile(keyFile, ekey, defaultFilePerms) 1188 dios <- struct{}{} 1189 return err 1190 } 1191 mb.bek.XORKeyStream(buf, buf) 1192 <-dios 1193 err = os.WriteFile(mb.mfn, buf, defaultFilePerms) 1194 dios <- struct{}{} 1195 if err != nil { 1196 return err 1197 } 1198 return nil 1199 } 1200 return fmt.Errorf("unable to recover keys") 1201 } 1202 1203 // Convert a plaintext block to encrypted. 1204 func (mb *msgBlock) convertToEncrypted() error { 1205 if mb.bek == nil { 1206 return nil 1207 } 1208 buf, err := mb.loadBlock(nil) 1209 if err != nil { 1210 return err 1211 } 1212 if err := mb.indexCacheBuf(buf); err != nil { 1213 // This likely indicates this was already encrypted or corrupt. 1214 mb.cache = nil 1215 return err 1216 } 1217 // Undo cache from above for later. 
1218 mb.cache = nil 1219 mb.bek.XORKeyStream(buf, buf) 1220 <-dios 1221 err = os.WriteFile(mb.mfn, buf, defaultFilePerms) 1222 dios <- struct{}{} 1223 if err != nil { 1224 return err 1225 } 1226 return nil 1227 } 1228 1229 // Rebuild the state of the blk based on what we have on disk in the N.blk file. 1230 // We will return any lost data, and we will return any delete tombstones we encountered. 1231 func (mb *msgBlock) rebuildState() (*LostStreamData, []uint64, error) { 1232 mb.mu.Lock() 1233 defer mb.mu.Unlock() 1234 return mb.rebuildStateLocked() 1235 } 1236 1237 // Rebuild the state of the blk based on what we have on disk in the N.blk file. 1238 // Lock should be held. 1239 func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { 1240 startLastSeq := atomic.LoadUint64(&mb.last.seq) 1241 1242 // Remove the .fss file and clear any cache we have set. 1243 mb.clearCacheAndOffset() 1244 1245 buf, err := mb.loadBlock(nil) 1246 defer recycleMsgBlockBuf(buf) 1247 1248 if err != nil || len(buf) == 0 { 1249 var ld *LostStreamData 1250 // No data to rebuild from here. 1251 if mb.msgs > 0 { 1252 // We need to declare lost data here. 1253 ld = &LostStreamData{Msgs: make([]uint64, 0, mb.msgs), Bytes: mb.bytes} 1254 firstSeq, lastSeq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 1255 for seq := firstSeq; seq <= lastSeq; seq++ { 1256 if !mb.dmap.Exists(seq) { 1257 ld.Msgs = append(ld.Msgs, seq) 1258 } 1259 } 1260 // Clear invalid state. We will let this blk be added in here. 1261 mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil 1262 mb.dmap.Empty() 1263 atomic.StoreUint64(&mb.first.seq, atomic.LoadUint64(&mb.last.seq)+1) 1264 } 1265 return ld, nil, err 1266 } 1267 1268 // Clear state we need to rebuild. 1269 mb.msgs, mb.bytes, mb.rbytes, mb.fss = 0, 0, 0, nil 1270 atomic.StoreUint64(&mb.last.seq, 0) 1271 mb.last.ts = 0 1272 firstNeedsSet := true 1273 1274 // Check if we need to decrypt. 
1275 if mb.bek != nil && len(buf) > 0 { 1276 // Recreate to reset counter. 1277 mb.bek, err = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 1278 if err != nil { 1279 return nil, nil, err 1280 } 1281 mb.bek.XORKeyStream(buf, buf) 1282 } 1283 1284 // Check for compression. 1285 if buf, err = mb.decompressIfNeeded(buf); err != nil { 1286 return nil, nil, err 1287 } 1288 1289 mb.rbytes = uint64(len(buf)) 1290 1291 addToDmap := func(seq uint64) { 1292 if seq == 0 { 1293 return 1294 } 1295 mb.dmap.Insert(seq) 1296 } 1297 1298 var le = binary.LittleEndian 1299 1300 truncate := func(index uint32) { 1301 var fd *os.File 1302 if mb.mfd != nil { 1303 fd = mb.mfd 1304 } else { 1305 <-dios 1306 fd, err = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms) 1307 dios <- struct{}{} 1308 if err == nil { 1309 defer fd.Close() 1310 } 1311 } 1312 if fd == nil { 1313 return 1314 } 1315 if err := fd.Truncate(int64(index)); err == nil { 1316 // Update our checksum. 1317 if index >= 8 { 1318 var lchk [8]byte 1319 fd.ReadAt(lchk[:], int64(index-8)) 1320 copy(mb.lchk[0:], lchk[:]) 1321 } 1322 fd.Sync() 1323 } 1324 } 1325 1326 gatherLost := func(lb uint32) *LostStreamData { 1327 var ld LostStreamData 1328 for seq := atomic.LoadUint64(&mb.last.seq) + 1; seq <= startLastSeq; seq++ { 1329 ld.Msgs = append(ld.Msgs, seq) 1330 } 1331 ld.Bytes = uint64(lb) 1332 return &ld 1333 } 1334 1335 // For tombstones that we find and collect. 1336 var ( 1337 tombstones []uint64 1338 minTombstoneSeq uint64 1339 minTombstoneTs int64 1340 ) 1341 1342 for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; { 1343 if index+msgHdrSize > lbuf { 1344 truncate(index) 1345 return gatherLost(lbuf - index), tombstones, nil 1346 } 1347 1348 hdr := buf[index : index+msgHdrSize] 1349 rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:]) 1350 1351 hasHeaders := rl&hbit != 0 1352 // Clear any headers bit that could be set. 
1353 rl &^= hbit 1354 dlen := int(rl) - msgHdrSize 1355 // Do some quick sanity checks here. 1356 if dlen < 0 || int(slen) > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { 1357 truncate(index) 1358 return gatherLost(lbuf - index), tombstones, errBadMsg 1359 } 1360 1361 // Check for checksum failures before additional processing. 1362 data := buf[index+msgHdrSize : index+rl] 1363 if hh := mb.hh; hh != nil { 1364 hh.Reset() 1365 hh.Write(hdr[4:20]) 1366 hh.Write(data[:slen]) 1367 if hasHeaders { 1368 hh.Write(data[slen+4 : dlen-recordHashSize]) 1369 } else { 1370 hh.Write(data[slen : dlen-recordHashSize]) 1371 } 1372 checksum := hh.Sum(nil) 1373 if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) { 1374 truncate(index) 1375 return gatherLost(lbuf - index), tombstones, errBadMsg 1376 } 1377 copy(mb.lchk[0:], checksum) 1378 } 1379 1380 // Grab our sequence and timestamp. 1381 seq := le.Uint64(hdr[4:]) 1382 ts := int64(le.Uint64(hdr[12:])) 1383 1384 // Check if this is a delete tombstone. 1385 if seq&tbit != 0 { 1386 seq = seq &^ tbit 1387 // Need to process this here and make sure we have accounted for this properly. 1388 tombstones = append(tombstones, seq) 1389 if minTombstoneSeq == 0 || seq < minTombstoneSeq { 1390 minTombstoneSeq, minTombstoneTs = seq, ts 1391 } 1392 index += rl 1393 continue 1394 } 1395 1396 fseq := atomic.LoadUint64(&mb.first.seq) 1397 // This is an old erased message, or a new one that we can track. 1398 if seq == 0 || seq&ebit != 0 || seq < fseq { 1399 seq = seq &^ ebit 1400 if seq >= fseq { 1401 // Only add to dmap if past recorded first seq and non-zero. 
1402 if seq != 0 { 1403 addToDmap(seq) 1404 } 1405 atomic.StoreUint64(&mb.last.seq, seq) 1406 mb.last.ts = ts 1407 if mb.msgs == 0 { 1408 atomic.StoreUint64(&mb.first.seq, seq+1) 1409 mb.first.ts = 0 1410 } 1411 } 1412 index += rl 1413 continue 1414 } 1415 1416 // This is for when we have index info that adjusts for deleted messages 1417 // at the head. So the first.seq will be already set here. If this is larger 1418 // replace what we have with this seq. 1419 if firstNeedsSet && seq >= fseq { 1420 atomic.StoreUint64(&mb.first.seq, seq) 1421 firstNeedsSet, mb.first.ts = false, ts 1422 } 1423 1424 if !mb.dmap.Exists(seq) { 1425 mb.msgs++ 1426 mb.bytes += uint64(rl) 1427 } 1428 1429 // Always set last 1430 atomic.StoreUint64(&mb.last.seq, seq) 1431 mb.last.ts = ts 1432 1433 // Advance to next record. 1434 index += rl 1435 } 1436 1437 // For empty msg blocks make sure we recover last seq correctly based off of first. 1438 // Or if we seem to have no messages but had a tombstone, which we use to remember 1439 // sequences and timestamps now, use that to properly setup the first and last. 1440 if mb.msgs == 0 { 1441 fseq := atomic.LoadUint64(&mb.first.seq) 1442 if fseq > 0 { 1443 atomic.StoreUint64(&mb.last.seq, fseq-1) 1444 } else if fseq == 0 && minTombstoneSeq > 0 { 1445 atomic.StoreUint64(&mb.first.seq, minTombstoneSeq+1) 1446 mb.first.ts = 0 1447 if mb.last.seq == 0 { 1448 atomic.StoreUint64(&mb.last.seq, minTombstoneSeq) 1449 mb.last.ts = minTombstoneTs 1450 } 1451 } 1452 } 1453 1454 return nil, tombstones, nil 1455 } 1456 1457 // For doing warn logging. 1458 // Lock should be held. 1459 func (fs *fileStore) warn(format string, args ...any) { 1460 // No-op if no server configured. 1461 if fs.srv == nil { 1462 return 1463 } 1464 fs.srv.Warnf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...) 1465 } 1466 1467 // For doing debug logging. 1468 // Lock should be held. 
1469 func (fs *fileStore) debug(format string, args ...any) { 1470 // No-op if no server configured. 1471 if fs.srv == nil { 1472 return 1473 } 1474 fs.srv.Debugf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...) 1475 } 1476 1477 // Track local state but ignore timestamps here. 1478 func updateTrackingState(state *StreamState, mb *msgBlock) { 1479 if state.FirstSeq == 0 { 1480 state.FirstSeq = mb.first.seq 1481 } else if mb.first.seq < state.FirstSeq { 1482 state.FirstSeq = mb.first.seq 1483 } 1484 if mb.last.seq > state.LastSeq { 1485 state.LastSeq = mb.last.seq 1486 } 1487 state.Msgs += mb.msgs 1488 state.Bytes += mb.bytes 1489 } 1490 1491 // Determine if our tracking states are the same. 1492 func trackingStatesEqual(fs, mb *StreamState) bool { 1493 // When a fs is brand new the fs state will have first seq of 0, but tracking mb may have 1. 1494 // If either has a first sequence that is not 0 or 1 we will check if they are the same, otherwise skip. 1495 if (fs.FirstSeq > 1 && mb.FirstSeq > 1) || mb.FirstSeq > 1 { 1496 return fs.Msgs == mb.Msgs && fs.FirstSeq == mb.FirstSeq && fs.LastSeq == mb.LastSeq && fs.Bytes == mb.Bytes 1497 } 1498 return fs.Msgs == mb.Msgs && fs.LastSeq == mb.LastSeq && fs.Bytes == mb.Bytes 1499 } 1500 1501 // recoverFullState will attempt to receover our last full state and re-process any state changes 1502 // that happened afterwards. 1503 func (fs *fileStore) recoverFullState() (rerr error) { 1504 fs.mu.Lock() 1505 defer fs.mu.Unlock() 1506 1507 // Check for any left over purged messages. 1508 <-dios 1509 pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) 1510 if _, err := os.Stat(pdir); err == nil { 1511 os.RemoveAll(pdir) 1512 } 1513 // Grab our stream state file and load it in. 
1514 fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) 1515 buf, err := os.ReadFile(fn) 1516 dios <- struct{}{} 1517 1518 if err != nil { 1519 if !os.IsNotExist(err) { 1520 fs.warn("Could not read stream state file: %v", err) 1521 } 1522 return err 1523 } 1524 1525 const minLen = 32 1526 if len(buf) < minLen { 1527 os.Remove(fn) 1528 fs.warn("Stream state too short (%d bytes)", len(buf)) 1529 return errCorruptState 1530 } 1531 1532 // The highwayhash will be on the end. Check that it still matches. 1533 h := buf[len(buf)-highwayhash.Size64:] 1534 buf = buf[:len(buf)-highwayhash.Size64] 1535 fs.hh.Reset() 1536 fs.hh.Write(buf) 1537 if !bytes.Equal(h, fs.hh.Sum(nil)) { 1538 os.Remove(fn) 1539 fs.warn("Stream state checksum did not match") 1540 return errCorruptState 1541 } 1542 1543 // Decrypt if needed. 1544 if fs.prf != nil { 1545 // We can be setup for encryption but if this is a snapshot restore we will be missing the keyfile 1546 // since snapshots strip encryption. 
1547 if err := fs.recoverAEK(); err == nil { 1548 ns := fs.aek.NonceSize() 1549 buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:], nil) 1550 if err != nil { 1551 fs.warn("Stream state error reading encryption key: %v", err) 1552 return err 1553 } 1554 } 1555 } 1556 1557 if buf[0] != fullStateMagic || buf[1] != fullStateVersion { 1558 os.Remove(fn) 1559 fs.warn("Stream state magic and version mismatch") 1560 return errCorruptState 1561 } 1562 1563 bi := hdrLen 1564 1565 readU64 := func() uint64 { 1566 if bi < 0 { 1567 return 0 1568 } 1569 v, n := binary.Uvarint(buf[bi:]) 1570 if n <= 0 { 1571 bi = -1 1572 return 0 1573 } 1574 bi += n 1575 return v 1576 } 1577 readI64 := func() int64 { 1578 if bi < 0 { 1579 return 0 1580 } 1581 v, n := binary.Varint(buf[bi:]) 1582 if n <= 0 { 1583 bi = -1 1584 return -1 1585 } 1586 bi += n 1587 return v 1588 } 1589 1590 setTime := func(t *time.Time, ts int64) { 1591 if ts == 0 { 1592 *t = time.Time{} 1593 } else { 1594 *t = time.Unix(0, ts).UTC() 1595 } 1596 } 1597 1598 var state StreamState 1599 state.Msgs = readU64() 1600 state.Bytes = readU64() 1601 state.FirstSeq = readU64() 1602 baseTime := readI64() 1603 setTime(&state.FirstTime, baseTime) 1604 state.LastSeq = readU64() 1605 setTime(&state.LastTime, readI64()) 1606 1607 // Check for per subject info. 1608 if numSubjects := int(readU64()); numSubjects > 0 { 1609 fs.psim, fs.tsl = fs.psim.Empty(), 0 1610 for i := 0; i < numSubjects; i++ { 1611 if lsubj := int(readU64()); lsubj > 0 { 1612 if bi+lsubj > len(buf) { 1613 os.Remove(fn) 1614 fs.warn("Stream state bad subject len (%d)", lsubj) 1615 return errCorruptState 1616 } 1617 // If we have lots of subjects this will alloc for each one. 1618 // We could reference the underlying buffer, but we could guess wrong if 1619 // number of blocks is large and subjects is low, since we would reference buf. 1620 subj := buf[bi : bi+lsubj] 1621 // We had a bug that could cause memory corruption in the PSIM that could have gotten stored to disk. 
1622 // Only would affect subjects, so do quick check. 1623 if !isValidSubject(string(subj), true) { 1624 os.Remove(fn) 1625 fs.warn("Stream state corrupt subject detected") 1626 return errCorruptState 1627 } 1628 bi += lsubj 1629 psi := psi{total: readU64(), fblk: uint32(readU64())} 1630 if psi.total > 1 { 1631 psi.lblk = uint32(readU64()) 1632 } else { 1633 psi.lblk = psi.fblk 1634 } 1635 fs.psim.Insert(subj, psi) 1636 fs.tsl += lsubj 1637 } 1638 } 1639 } 1640 1641 // Track the state as represented by the blocks themselves. 1642 var mstate StreamState 1643 1644 if numBlocks := readU64(); numBlocks > 0 { 1645 lastIndex := int(numBlocks - 1) 1646 fs.blks = make([]*msgBlock, 0, numBlocks) 1647 for i := 0; i < int(numBlocks); i++ { 1648 index, nbytes, fseq, fts, lseq, lts, numDeleted := uint32(readU64()), readU64(), readU64(), readI64(), readU64(), readI64(), readU64() 1649 if bi < 0 { 1650 break 1651 } 1652 mb := fs.initMsgBlock(index) 1653 atomic.StoreUint64(&mb.first.seq, fseq) 1654 atomic.StoreUint64(&mb.last.seq, lseq) 1655 mb.msgs, mb.bytes = lseq-fseq+1, nbytes 1656 mb.first.ts, mb.last.ts = fts+baseTime, lts+baseTime 1657 if numDeleted > 0 { 1658 dmap, n, err := avl.Decode(buf[bi:]) 1659 if err != nil { 1660 os.Remove(fn) 1661 fs.warn("Stream state error decoding avl dmap: %v", err) 1662 return errCorruptState 1663 } 1664 mb.dmap = *dmap 1665 if mb.msgs > numDeleted { 1666 mb.msgs -= numDeleted 1667 } else { 1668 mb.msgs = 0 1669 } 1670 bi += n 1671 } 1672 // Only add in if not empty or the lmb. 1673 if mb.msgs > 0 || i == lastIndex { 1674 fs.addMsgBlock(mb) 1675 updateTrackingState(&mstate, mb) 1676 } else { 1677 // Mark dirty to cleanup. 1678 fs.dirty++ 1679 } 1680 } 1681 } 1682 1683 // Pull in last block index for the block that had last checksum when we wrote the full state. 
1684 blkIndex := uint32(readU64()) 1685 var lchk [8]byte 1686 if bi+len(lchk) > len(buf) { 1687 bi = -1 1688 } else { 1689 copy(lchk[0:], buf[bi:bi+len(lchk)]) 1690 } 1691 1692 // Check if we had any errors. 1693 if bi < 0 { 1694 os.Remove(fn) 1695 fs.warn("Stream state has no checksum present") 1696 return errCorruptState 1697 } 1698 1699 // Move into place our state, msgBlks and subject info. 1700 fs.state = state 1701 1702 // First let's check the happy path, open the blk file that was the lmb when we created the full state. 1703 // See if we have the last block available. 1704 var matched bool 1705 mb := fs.lmb 1706 if mb == nil || mb.index != blkIndex { 1707 fs.warn("Stream state block does not exist or index mismatch") 1708 return errCorruptState 1709 } 1710 if _, err := os.Stat(mb.mfn); err != nil && os.IsNotExist(err) { 1711 // If our saved state is past what we see on disk, fallback and rebuild. 1712 if ld, _, _ := mb.rebuildState(); ld != nil { 1713 fs.addLostData(ld) 1714 } 1715 fs.warn("Stream state detected prior state, could not locate msg block %d", blkIndex) 1716 return errPriorState 1717 } 1718 if matched = bytes.Equal(mb.lastChecksum(), lchk[:]); !matched { 1719 // Remove the last message block since recover will add in the new one. 1720 fs.removeMsgBlockFromList(mb) 1721 // Reverse update of tracking state for this mb, will add new state in below. 1722 mstate.Msgs -= mb.msgs 1723 mstate.Bytes -= mb.bytes 1724 if nmb, err := fs.recoverMsgBlock(mb.index); err != nil && !os.IsNotExist(err) { 1725 fs.warn("Stream state could not recover last msg block") 1726 os.Remove(fn) 1727 return errCorruptState 1728 } else if nmb != nil { 1729 fs.adjustAccounting(mb, nmb) 1730 updateTrackingState(&mstate, nmb) 1731 } 1732 } 1733 1734 // On success double check our state. 1735 checkState := func() error { 1736 // We check first and last seq and number of msgs and bytes. 
If there is a difference, 1737 // return and error so we rebuild from the message block state on disk. 1738 if !trackingStatesEqual(&fs.state, &mstate) { 1739 fs.warn("Stream state encountered internal inconsistency on recover") 1740 os.Remove(fn) 1741 return errCorruptState 1742 } 1743 return nil 1744 } 1745 1746 // We may need to check other blocks. Even if we matched last checksum we will see if there is another block. 1747 for bi := blkIndex + 1; ; bi++ { 1748 nmb, err := fs.recoverMsgBlock(bi) 1749 if err != nil { 1750 if os.IsNotExist(err) { 1751 return checkState() 1752 } 1753 os.Remove(fn) 1754 fs.warn("Stream state could not recover msg block %d", bi) 1755 return err 1756 } 1757 if nmb != nil { 1758 // Update top level accounting 1759 if fseq := atomic.LoadUint64(&nmb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { 1760 fs.state.FirstSeq = fseq 1761 fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC() 1762 } 1763 if lseq := atomic.LoadUint64(&nmb.last.seq); lseq > fs.state.LastSeq { 1764 fs.state.LastSeq = lseq 1765 fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC() 1766 } 1767 fs.state.Msgs += nmb.msgs 1768 fs.state.Bytes += nmb.bytes 1769 updateTrackingState(&mstate, nmb) 1770 } 1771 } 1772 } 1773 1774 // adjustAccounting will be called when a stream state was only partially accounted for 1775 // within a message block, e.g. additional records were added after the stream state. 1776 // Lock should be held. 1777 func (fs *fileStore) adjustAccounting(mb, nmb *msgBlock) { 1778 nmb.mu.Lock() 1779 defer nmb.mu.Unlock() 1780 1781 // First make sure the new block is loaded. 1782 if nmb.cacheNotLoaded() { 1783 nmb.loadMsgsWithLock() 1784 } 1785 nmb.ensurePerSubjectInfoLoaded() 1786 1787 var smv StoreMsg 1788 1789 // Need to walk previous messages and undo psim stats. 1790 // We already undid msgs and bytes accounting. 
1791 for seq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { 1792 // Lookup the message. If an error will be deleted, so can skip. 1793 sm, err := nmb.cacheLookup(seq, &smv) 1794 if err != nil { 1795 continue 1796 } 1797 if len(sm.subj) > 0 && fs.psim != nil { 1798 if info, ok := fs.psim.Find(stringToBytes(sm.subj)); ok { 1799 info.total-- 1800 } 1801 } 1802 } 1803 1804 // Walk only new messages and update accounting at fs level. Any messages that should have 1805 // triggered limits exceeded will be handled after the recovery and prior to the stream 1806 // being available to the system. 1807 for seq, lseq := atomic.LoadUint64(&mb.last.seq)+1, atomic.LoadUint64(&nmb.last.seq); seq <= lseq; seq++ { 1808 // Lookup the message. If an error will be deleted, so can skip. 1809 sm, err := nmb.cacheLookup(seq, &smv) 1810 if err != nil { 1811 continue 1812 } 1813 // Since we found it we just need to adjust fs totals and psim. 1814 fs.state.Msgs++ 1815 fs.state.Bytes += fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) 1816 } 1817 1818 // Now check to see if we had a higher first for the recovered state mb vs nmb. 1819 if atomic.LoadUint64(&nmb.first.seq) < atomic.LoadUint64(&mb.first.seq) { 1820 // Now set first for nmb. 1821 atomic.StoreUint64(&nmb.first.seq, atomic.LoadUint64(&mb.first.seq)) 1822 } 1823 1824 // Update top level accounting. 1825 if fseq := atomic.LoadUint64(&nmb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { 1826 fs.state.FirstSeq = fseq 1827 fs.state.FirstTime = time.Unix(0, nmb.first.ts).UTC() 1828 } 1829 if lseq := atomic.LoadUint64(&nmb.last.seq); lseq > fs.state.LastSeq { 1830 fs.state.LastSeq = lseq 1831 fs.state.LastTime = time.Unix(0, nmb.last.ts).UTC() 1832 } 1833 } 1834 1835 // Grabs last checksum for the named block file. 1836 // Takes into account encryption etc. 
1837 func (mb *msgBlock) lastChecksum() []byte { 1838 f, err := mb.openBlock() 1839 if err != nil { 1840 return nil 1841 } 1842 defer f.Close() 1843 1844 var lchk [8]byte 1845 if fi, _ := f.Stat(); fi != nil { 1846 mb.rbytes = uint64(fi.Size()) 1847 } 1848 if mb.rbytes < checksumSize { 1849 return nil 1850 } 1851 // Encrypted? 1852 // Check for encryption, we do not load keys on startup anymore so might need to load them here. 1853 if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) { 1854 if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil { 1855 return nil 1856 } 1857 } 1858 if mb.bek != nil { 1859 if buf, _ := mb.loadBlock(nil); len(buf) >= checksumSize { 1860 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 1861 if err != nil { 1862 return nil 1863 } 1864 mb.bek = bek 1865 mb.bek.XORKeyStream(buf, buf) 1866 copy(lchk[0:], buf[len(buf)-checksumSize:]) 1867 } 1868 } else { 1869 f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize) 1870 } 1871 return lchk[:] 1872 } 1873 1874 // This will make sure we clean up old idx and fss files. 1875 func (fs *fileStore) cleanupOldMeta() { 1876 fs.mu.RLock() 1877 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 1878 fs.mu.RUnlock() 1879 1880 <-dios 1881 f, err := os.Open(mdir) 1882 dios <- struct{}{} 1883 if err != nil { 1884 return 1885 } 1886 1887 dirs, _ := f.ReadDir(-1) 1888 f.Close() 1889 1890 const ( 1891 minLen = 4 1892 idxSuffix = ".idx" 1893 fssSuffix = ".fss" 1894 ) 1895 for _, fi := range dirs { 1896 if name := fi.Name(); strings.HasSuffix(name, idxSuffix) || strings.HasSuffix(name, fssSuffix) { 1897 os.Remove(filepath.Join(mdir, name)) 1898 } 1899 } 1900 } 1901 1902 func (fs *fileStore) recoverMsgs() error { 1903 fs.mu.Lock() 1904 defer fs.mu.Unlock() 1905 1906 // Check for any left over purged messages. 
1907 <-dios 1908 pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) 1909 if _, err := os.Stat(pdir); err == nil { 1910 os.RemoveAll(pdir) 1911 } 1912 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 1913 f, err := os.Open(mdir) 1914 if err != nil { 1915 dios <- struct{}{} 1916 return errNotReadable 1917 } 1918 dirs, err := f.ReadDir(-1) 1919 f.Close() 1920 dios <- struct{}{} 1921 1922 if err != nil { 1923 return errNotReadable 1924 } 1925 1926 indices := make(sort.IntSlice, 0, len(dirs)) 1927 var index int 1928 for _, fi := range dirs { 1929 if n, err := fmt.Sscanf(fi.Name(), blkScan, &index); err == nil && n == 1 { 1930 indices = append(indices, index) 1931 } 1932 } 1933 indices.Sort() 1934 1935 // Recover all of the msg blocks. 1936 // We now guarantee they are coming in order. 1937 for _, index := range indices { 1938 if mb, err := fs.recoverMsgBlock(uint32(index)); err == nil && mb != nil { 1939 // This is a truncate block with possibly no index. If the OS got shutdown 1940 // out from underneath of us this is possible. 1941 if mb.first.seq == 0 { 1942 mb.dirtyCloseWithRemove(true) 1943 fs.removeMsgBlockFromList(mb) 1944 continue 1945 } 1946 if fseq := atomic.LoadUint64(&mb.first.seq); fs.state.FirstSeq == 0 || fseq < fs.state.FirstSeq { 1947 fs.state.FirstSeq = fseq 1948 if mb.first.ts == 0 { 1949 fs.state.FirstTime = time.Time{} 1950 } else { 1951 fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() 1952 } 1953 } 1954 if lseq := atomic.LoadUint64(&mb.last.seq); lseq > fs.state.LastSeq { 1955 fs.state.LastSeq = lseq 1956 if mb.last.ts == 0 { 1957 fs.state.LastTime = time.Time{} 1958 } else { 1959 fs.state.LastTime = time.Unix(0, mb.last.ts).UTC() 1960 } 1961 } 1962 fs.state.Msgs += mb.msgs 1963 fs.state.Bytes += mb.bytes 1964 } else { 1965 return err 1966 } 1967 } 1968 1969 if len(fs.blks) > 0 { 1970 fs.lmb = fs.blks[len(fs.blks)-1] 1971 } else { 1972 _, err = fs.newMsgBlockForWrite() 1973 } 1974 1975 // Check if we encountered any lost data. 
1976 if fs.ld != nil { 1977 var emptyBlks []*msgBlock 1978 for _, mb := range fs.blks { 1979 if mb.msgs == 0 && mb.rbytes == 0 { 1980 emptyBlks = append(emptyBlks, mb) 1981 } 1982 } 1983 for _, mb := range emptyBlks { 1984 // Need the mb lock here. 1985 mb.mu.Lock() 1986 fs.removeMsgBlock(mb) 1987 mb.mu.Unlock() 1988 } 1989 } 1990 1991 if err != nil { 1992 return err 1993 } 1994 1995 // Check for keyfiles orphans. 1996 if kms, err := filepath.Glob(filepath.Join(mdir, keyScanAll)); err == nil && len(kms) > 0 { 1997 valid := make(map[uint32]bool) 1998 for _, mb := range fs.blks { 1999 valid[mb.index] = true 2000 } 2001 for _, fn := range kms { 2002 var index uint32 2003 shouldRemove := true 2004 if n, err := fmt.Sscanf(filepath.Base(fn), keyScan, &index); err == nil && n == 1 && valid[index] { 2005 shouldRemove = false 2006 } 2007 if shouldRemove { 2008 os.Remove(fn) 2009 } 2010 } 2011 } 2012 2013 return nil 2014 } 2015 2016 // Will expire msgs that have aged out on restart. 2017 // We will treat this differently in case we have a recovery 2018 // that will expire alot of messages on startup. 2019 // Should only be called on startup. 2020 func (fs *fileStore) expireMsgsOnRecover() { 2021 if fs.state.Msgs == 0 { 2022 return 2023 } 2024 2025 var minAge = time.Now().UnixNano() - int64(fs.cfg.MaxAge) 2026 var purged, bytes uint64 2027 var deleted int 2028 var nts int64 2029 2030 // If we expire all make sure to write out a tombstone. Need to be done by hand here, 2031 // usually taken care of by fs.removeMsgBlock() but we do not call that here. 2032 var last msgId 2033 2034 deleteEmptyBlock := func(mb *msgBlock) { 2035 // If we are the last keep state to remember first/last sequence. 2036 // Do this part by hand since not deleting one by one. 2037 if mb == fs.lmb { 2038 last.seq = atomic.LoadUint64(&mb.last.seq) 2039 last.ts = mb.last.ts 2040 } 2041 // Make sure we do subject cleanup as well. 
2042 mb.ensurePerSubjectInfoLoaded() 2043 for subj, ss := range mb.fss { 2044 for i := uint64(0); i < ss.Msgs; i++ { 2045 fs.removePerSubject(subj) 2046 } 2047 } 2048 mb.dirtyCloseWithRemove(true) 2049 deleted++ 2050 } 2051 2052 for _, mb := range fs.blks { 2053 mb.mu.Lock() 2054 if minAge < mb.first.ts { 2055 nts = mb.first.ts 2056 mb.mu.Unlock() 2057 break 2058 } 2059 // Can we remove whole block here? 2060 if mb.last.ts <= minAge { 2061 purged += mb.msgs 2062 bytes += mb.bytes 2063 deleteEmptyBlock(mb) 2064 mb.mu.Unlock() 2065 continue 2066 } 2067 2068 // If we are here we have to process the interior messages of this blk. 2069 // This will load fss as well. 2070 if err := mb.loadMsgsWithLock(); err != nil { 2071 mb.mu.Unlock() 2072 break 2073 } 2074 2075 var smv StoreMsg 2076 var needNextFirst bool 2077 2078 // Walk messages and remove if expired. 2079 fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 2080 for seq := fseq; seq <= lseq; seq++ { 2081 sm, err := mb.cacheLookup(seq, &smv) 2082 // Process interior deleted msgs. 2083 if err == errDeletedMsg { 2084 // Update dmap. 2085 if mb.dmap.Exists(seq) { 2086 mb.dmap.Delete(seq) 2087 } 2088 // Keep this updated just in case since we are removing dmap entries. 2089 atomic.StoreUint64(&mb.first.seq, seq) 2090 needNextFirst = true 2091 continue 2092 } 2093 // Break on other errors. 2094 if err != nil || sm == nil { 2095 atomic.StoreUint64(&mb.first.seq, seq) 2096 needNextFirst = true 2097 break 2098 } 2099 2100 // No error and sm != nil from here onward. 2101 2102 // Check for done. 2103 if minAge < sm.ts { 2104 atomic.StoreUint64(&mb.first.seq, sm.seq) 2105 mb.first.ts = sm.ts 2106 needNextFirst = false 2107 nts = sm.ts 2108 break 2109 } 2110 2111 // Delete the message here. 
2112 if mb.msgs > 0 { 2113 atomic.StoreUint64(&mb.first.seq, seq) 2114 needNextFirst = true 2115 sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) 2116 if sz > mb.bytes { 2117 sz = mb.bytes 2118 } 2119 mb.bytes -= sz 2120 bytes += sz 2121 mb.msgs-- 2122 purged++ 2123 } 2124 // Update fss 2125 // Make sure we have fss loaded. 2126 mb.removeSeqPerSubject(sm.subj, seq) 2127 fs.removePerSubject(sm.subj) 2128 } 2129 // Make sure we have a proper next first sequence. 2130 if needNextFirst { 2131 mb.selectNextFirst() 2132 } 2133 // Check if empty after processing, could happen if tail of messages are all deleted. 2134 if mb.msgs == 0 { 2135 deleteEmptyBlock(mb) 2136 } 2137 mb.mu.Unlock() 2138 break 2139 } 2140 2141 if nts > 0 { 2142 // Make sure to set age check based on this value. 2143 fs.resetAgeChk(nts - minAge) 2144 } 2145 2146 if deleted > 0 { 2147 // Update block map. 2148 if fs.bim != nil { 2149 for _, mb := range fs.blks[:deleted] { 2150 delete(fs.bim, mb.index) 2151 } 2152 } 2153 // Update blks slice. 2154 fs.blks = copyMsgBlocks(fs.blks[deleted:]) 2155 if lb := len(fs.blks); lb == 0 { 2156 fs.lmb = nil 2157 } else { 2158 fs.lmb = fs.blks[lb-1] 2159 } 2160 } 2161 // Update top level accounting. 2162 if purged < fs.state.Msgs { 2163 fs.state.Msgs -= purged 2164 } else { 2165 fs.state.Msgs = 0 2166 } 2167 if bytes < fs.state.Bytes { 2168 fs.state.Bytes -= bytes 2169 } else { 2170 fs.state.Bytes = 0 2171 } 2172 // Make sure to we properly set the fs first sequence and timestamp. 2173 fs.selectNextFirst() 2174 2175 // Check if we have no messages and blocks left. 2176 if fs.lmb == nil && last.seq != 0 { 2177 if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { 2178 lmb.writeTombstone(last.seq, last.ts) 2179 } 2180 // Clear any global subject state. 2181 fs.psim, fs.tsl = fs.psim.Empty(), 0 2182 } 2183 2184 // If we purged anything, make sure we kick flush state loop. 
2185 if purged > 0 { 2186 fs.dirty++ 2187 } 2188 } 2189 2190 func copyMsgBlocks(src []*msgBlock) []*msgBlock { 2191 if src == nil { 2192 return nil 2193 } 2194 dst := make([]*msgBlock, len(src)) 2195 copy(dst, src) 2196 return dst 2197 } 2198 2199 // GetSeqFromTime looks for the first sequence number that has 2200 // the message with >= timestamp. 2201 // FIXME(dlc) - inefficient, and dumb really. Make this better. 2202 func (fs *fileStore) GetSeqFromTime(t time.Time) uint64 { 2203 fs.mu.RLock() 2204 lastSeq := fs.state.LastSeq 2205 closed := fs.closed 2206 fs.mu.RUnlock() 2207 2208 if closed { 2209 return 0 2210 } 2211 2212 mb := fs.selectMsgBlockForStart(t) 2213 if mb == nil { 2214 return lastSeq + 1 2215 } 2216 2217 fseq := atomic.LoadUint64(&mb.first.seq) 2218 lseq := atomic.LoadUint64(&mb.last.seq) 2219 2220 var smv StoreMsg 2221 2222 // Linear search, hence the dumb part.. 2223 ts := t.UnixNano() 2224 for seq := fseq; seq <= lseq; seq++ { 2225 sm, _, _ := mb.fetchMsg(seq, &smv) 2226 if sm != nil && sm.ts >= ts { 2227 return sm.seq 2228 } 2229 } 2230 return 0 2231 } 2232 2233 // Find the first matching message against a sublist. 2234 func (mb *msgBlock) firstMatchingMulti(sl *Sublist, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) { 2235 mb.mu.Lock() 2236 defer mb.mu.Unlock() 2237 2238 // Will just do linear walk for now. 2239 // TODO(dlc) - Be better at skipping blocks that will not match us regardless. 2240 2241 var didLoad bool 2242 // Need messages loaded from here on out. 
2243 if mb.cacheNotLoaded() { 2244 if err := mb.loadMsgsWithLock(); err != nil { 2245 return nil, false, err 2246 } 2247 didLoad = true 2248 } 2249 2250 // Make sure to start at mb.first.seq if fseq < mb.first.seq 2251 if seq := atomic.LoadUint64(&mb.first.seq); seq > start { 2252 start = seq 2253 } 2254 lseq := atomic.LoadUint64(&mb.last.seq) 2255 2256 if sm == nil { 2257 sm = new(StoreMsg) 2258 } 2259 2260 var result SublistResult 2261 for seq := start; seq <= lseq; seq++ { 2262 llseq := mb.llseq 2263 fsm, err := mb.cacheLookup(seq, sm) 2264 if err != nil { 2265 continue 2266 } 2267 expireOk := seq == lseq && mb.llseq == seq 2268 2269 if r := sl.MatchWithResult(fsm.subj, &result); len(r.psubs) > 0 { 2270 return fsm, expireOk, nil 2271 } 2272 // If we are here we did not match, so put the llseq back. 2273 mb.llseq = llseq 2274 } 2275 return nil, didLoad, ErrStoreMsgNotFound 2276 } 2277 2278 // Find the first matching message. 2279 func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) { 2280 mb.mu.Lock() 2281 defer mb.mu.Unlock() 2282 2283 fseq, isAll, subs := start, filter == _EMPTY_ || filter == fwcs, []string{filter} 2284 2285 var didLoad bool 2286 if mb.fssNotLoaded() { 2287 // Make sure we have fss loaded. 2288 mb.loadMsgsWithLock() 2289 didLoad = true 2290 } 2291 2292 // If we only have 1 subject currently and it matches our filter we can also set isAll. 2293 if !isAll && len(mb.fss) == 1 { 2294 _, isAll = mb.fss[filter] 2295 } 2296 // Make sure to start at mb.first.seq if fseq < mb.first.seq 2297 if seq := atomic.LoadUint64(&mb.first.seq); seq > fseq { 2298 fseq = seq 2299 } 2300 lseq := atomic.LoadUint64(&mb.last.seq) 2301 2302 // Optionally build the isMatch for wildcard filters. 2303 tsa := [32]string{} 2304 fsa := [32]string{} 2305 var fts []string 2306 var isMatch func(subj string) bool 2307 // Decide to build. 
2308 if wc { 2309 fts = tokenizeSubjectIntoSlice(fsa[:0], filter) 2310 isMatch = func(subj string) bool { 2311 tts := tokenizeSubjectIntoSlice(tsa[:0], subj) 2312 return isSubsetMatchTokenized(tts, fts) 2313 } 2314 } 2315 // Only do linear scan if isAll or we are wildcarded and have to traverse more fss than actual messages. 2316 doLinearScan := isAll || (wc && len(mb.fss) > int(lseq-fseq)) 2317 2318 if !doLinearScan { 2319 // If we have a wildcard match against all tracked subjects we know about. 2320 if wc { 2321 subs = subs[:0] 2322 for subj := range mb.fss { 2323 if isMatch(subj) { 2324 subs = append(subs, subj) 2325 } 2326 } 2327 // Check if we matched anything 2328 if len(subs) == 0 { 2329 return nil, didLoad, ErrStoreMsgNotFound 2330 } 2331 } 2332 fseq = lseq + 1 2333 for _, subj := range subs { 2334 ss := mb.fss[subj] 2335 if ss != nil && ss.firstNeedsUpdate { 2336 mb.recalculateFirstForSubj(subj, ss.First, ss) 2337 } 2338 if ss == nil || start > ss.Last || ss.First >= fseq { 2339 continue 2340 } 2341 if ss.First < start { 2342 fseq = start 2343 } else { 2344 fseq = ss.First 2345 } 2346 } 2347 } 2348 2349 if fseq > lseq { 2350 return nil, didLoad, ErrStoreMsgNotFound 2351 } 2352 2353 // If we guess to not do a linear scan, but the above resulted in alot of subs that will 2354 // need to be checked for every scanned message, revert. 2355 // TODO(dlc) - we could memoize the subs across calls. 2356 if len(subs) > int(lseq-fseq) { 2357 doLinearScan = true 2358 } 2359 2360 // Need messages loaded from here on out. 
2361 if mb.cacheNotLoaded() { 2362 if err := mb.loadMsgsWithLock(); err != nil { 2363 return nil, false, err 2364 } 2365 didLoad = true 2366 } 2367 2368 if sm == nil { 2369 sm = new(StoreMsg) 2370 } 2371 2372 for seq := fseq; seq <= lseq; seq++ { 2373 llseq := mb.llseq 2374 fsm, err := mb.cacheLookup(seq, sm) 2375 if err != nil { 2376 if err == errPartialCache || err == errNoCache { 2377 return nil, false, err 2378 } 2379 continue 2380 } 2381 expireOk := seq == lseq && mb.llseq == seq 2382 if isAll { 2383 return fsm, expireOk, nil 2384 } 2385 if doLinearScan { 2386 if wc && isMatch(sm.subj) { 2387 return fsm, expireOk, nil 2388 } else if !wc && fsm.subj == filter { 2389 return fsm, expireOk, nil 2390 } 2391 } else { 2392 for _, subj := range subs { 2393 if fsm.subj == subj { 2394 return fsm, expireOk, nil 2395 } 2396 } 2397 } 2398 // If we are here we did not match, so put the llseq back. 2399 mb.llseq = llseq 2400 } 2401 2402 return nil, didLoad, ErrStoreMsgNotFound 2403 } 2404 2405 // This will traverse a message block and generate the filtered pending. 2406 func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, first, last uint64) { 2407 mb.mu.Lock() 2408 defer mb.mu.Unlock() 2409 return mb.filteredPendingLocked(subj, wc, seq) 2410 } 2411 2412 // This will traverse a message block and generate the filtered pending. 2413 // Lock should be held. 2414 func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (total, first, last uint64) { 2415 isAll := filter == _EMPTY_ || filter == fwcs 2416 2417 // First check if we can optimize this part. 2418 // This means we want all and the starting sequence was before this block. 
2419 if isAll { 2420 if fseq := atomic.LoadUint64(&mb.first.seq); sseq <= fseq { 2421 return mb.msgs, fseq, atomic.LoadUint64(&mb.last.seq) 2422 } 2423 } 2424 2425 update := func(ss *SimpleState) { 2426 total += ss.Msgs 2427 if first == 0 || ss.First < first { 2428 first = ss.First 2429 } 2430 if ss.Last > last { 2431 last = ss.Last 2432 } 2433 } 2434 2435 // Make sure we have fss loaded. 2436 mb.ensurePerSubjectInfoLoaded() 2437 2438 tsa := [32]string{} 2439 fsa := [32]string{} 2440 fts := tokenizeSubjectIntoSlice(fsa[:0], filter) 2441 2442 // 1. See if we match any subs from fss. 2443 // 2. If we match and the sseq is past ss.Last then we can use meta only. 2444 // 3. If we match and we need to do a partial, break and clear any totals and do a full scan like num pending. 2445 2446 isMatch := func(subj string) bool { 2447 if !wc { 2448 return subj == filter 2449 } 2450 tts := tokenizeSubjectIntoSlice(tsa[:0], subj) 2451 return isSubsetMatchTokenized(tts, fts) 2452 } 2453 2454 var havePartial bool 2455 for subj, ss := range mb.fss { 2456 if isAll || isMatch(subj) { 2457 if ss.firstNeedsUpdate { 2458 mb.recalculateFirstForSubj(subj, ss.First, ss) 2459 } 2460 if sseq <= ss.First { 2461 update(ss) 2462 } else if sseq <= ss.Last { 2463 // We matched but its a partial. 2464 havePartial = true 2465 break 2466 } 2467 } 2468 } 2469 2470 // If we did not encounter any partials we can return here. 2471 if !havePartial { 2472 return total, first, last 2473 } 2474 2475 // If we are here we need to scan the msgs. 2476 // Clear what we had. 2477 total, first, last = 0, 0, 0 2478 2479 // If we load the cache for a linear scan we want to expire that cache upon exit. 
2480 var shouldExpire bool 2481 if mb.cacheNotLoaded() { 2482 mb.loadMsgsWithLock() 2483 shouldExpire = true 2484 } 2485 2486 var smv StoreMsg 2487 for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { 2488 sm, _ := mb.cacheLookup(seq, &smv) 2489 if sm == nil { 2490 continue 2491 } 2492 if isAll || isMatch(sm.subj) { 2493 total++ 2494 if first == 0 || seq < first { 2495 first = seq 2496 } 2497 if seq > last { 2498 last = seq 2499 } 2500 } 2501 } 2502 // If we loaded this block for this operation go ahead and expire it here. 2503 if shouldExpire { 2504 mb.tryForceExpireCacheLocked() 2505 } 2506 2507 return total, first, last 2508 } 2509 2510 // FilteredState will return the SimpleState associated with the filtered subject and a proposed starting sequence. 2511 func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState { 2512 fs.mu.RLock() 2513 defer fs.mu.RUnlock() 2514 2515 lseq := fs.state.LastSeq 2516 if sseq < fs.state.FirstSeq { 2517 sseq = fs.state.FirstSeq 2518 } 2519 2520 // Returned state. 2521 var ss SimpleState 2522 2523 // If past the end no results. 2524 if sseq > lseq { 2525 // Make sure we track sequences 2526 ss.First = fs.state.FirstSeq 2527 ss.Last = fs.state.LastSeq 2528 return ss 2529 } 2530 2531 // If we want all msgs that match we can shortcircuit. 2532 // TODO(dlc) - This can be extended for all cases but would 2533 // need to be careful on total msgs calculations etc. 2534 if sseq == fs.state.FirstSeq { 2535 fs.numFilteredPending(subj, &ss) 2536 } else { 2537 wc := subjectHasWildcard(subj) 2538 // Tracking subject state. 2539 // TODO(dlc) - Optimize for 2.10 with avl tree and no atomics per block. 2540 for _, mb := range fs.blks { 2541 // Skip blocks that are less than our starting sequence. 
2542 if sseq > atomic.LoadUint64(&mb.last.seq) { 2543 continue 2544 } 2545 t, f, l := mb.filteredPending(subj, wc, sseq) 2546 ss.Msgs += t 2547 if ss.First == 0 || (f > 0 && f < ss.First) { 2548 ss.First = f 2549 } 2550 if l > ss.Last { 2551 ss.Last = l 2552 } 2553 } 2554 } 2555 2556 return ss 2557 } 2558 2559 // Optimized way for getting all num pending matching a filter subject. 2560 // Lock should be held. 2561 func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) { 2562 isAll := filter == _EMPTY_ || filter == fwcs 2563 2564 // If isAll we do not need to do anything special to calculate the first and last and total. 2565 if isAll { 2566 ss.First = fs.state.FirstSeq 2567 ss.Last = fs.state.LastSeq 2568 ss.Msgs = fs.state.Msgs 2569 return 2570 } 2571 2572 start, stop := uint32(math.MaxUint32), uint32(0) 2573 fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) { 2574 ss.Msgs += psi.total 2575 // Keep track of start and stop indexes for this subject. 2576 if psi.fblk < start { 2577 start = psi.fblk 2578 } 2579 if psi.lblk > stop { 2580 stop = psi.lblk 2581 } 2582 }) 2583 // We do need to figure out the first and last sequences. 2584 wc := subjectHasWildcard(filter) 2585 // Do start 2586 mb := fs.bim[start] 2587 if mb != nil { 2588 _, f, _ := mb.filteredPending(filter, wc, 0) 2589 ss.First = f 2590 } 2591 if ss.First == 0 { 2592 // This is a miss. This can happen since psi.fblk is lazy, but should be very rare. 2593 for i := start + 1; i <= stop; i++ { 2594 mb := fs.bim[i] 2595 if mb == nil { 2596 continue 2597 } 2598 if _, f, _ := mb.filteredPending(filter, wc, 0); f > 0 { 2599 ss.First = f 2600 break 2601 } 2602 } 2603 } 2604 // Now last 2605 if mb = fs.bim[stop]; mb != nil { 2606 _, _, l := mb.filteredPending(filter, wc, 0) 2607 ss.Last = l 2608 } 2609 } 2610 2611 // SubjectsState returns a map of SimpleState for all matching subjects. 
2612 func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState { 2613 fs.mu.RLock() 2614 defer fs.mu.RUnlock() 2615 2616 if fs.state.Msgs == 0 || fs.noTrackSubjects() { 2617 return nil 2618 } 2619 2620 start, stop := fs.blks[0], fs.lmb 2621 // We can short circuit if not a wildcard using psim for start and stop. 2622 if !subjectHasWildcard(subject) { 2623 info, ok := fs.psim.Find(stringToBytes(subject)) 2624 if !ok { 2625 return nil 2626 } 2627 start, stop = fs.bim[info.fblk], fs.bim[info.lblk] 2628 } 2629 2630 // Aggregate fss. 2631 fss := make(map[string]SimpleState) 2632 var startFound bool 2633 2634 for _, mb := range fs.blks { 2635 if !startFound { 2636 if mb != start { 2637 continue 2638 } 2639 startFound = true 2640 } 2641 2642 mb.mu.Lock() 2643 var shouldExpire bool 2644 if mb.fssNotLoaded() { 2645 // Make sure we have fss loaded. 2646 mb.loadMsgsWithLock() 2647 shouldExpire = true 2648 } 2649 for subj, ss := range mb.fss { 2650 if subject == _EMPTY_ || subject == fwcs || subjectIsSubsetMatch(subj, subject) { 2651 if ss.firstNeedsUpdate { 2652 mb.recalculateFirstForSubj(subj, ss.First, ss) 2653 } 2654 oss := fss[subj] 2655 if oss.First == 0 { // New 2656 fss[subj] = *ss 2657 } else { 2658 // Merge here. 2659 oss.Last, oss.Msgs = ss.Last, oss.Msgs+ss.Msgs 2660 fss[subj] = oss 2661 } 2662 } 2663 } 2664 if shouldExpire { 2665 // Expire this cache before moving on. 2666 mb.tryForceExpireCacheLocked() 2667 } 2668 mb.mu.Unlock() 2669 2670 if mb == stop { 2671 break 2672 } 2673 } 2674 2675 return fss 2676 } 2677 2678 // MultiLastSeqs will return a sorted list of sequences that match all subjects presented in filters. 2679 // We will not exceed the maxSeq, which if 0 becomes the store's last sequence. 
2680 func (fs *fileStore) MultiLastSeqs(filters []string, maxSeq uint64, maxAllowed int) ([]uint64, error) { 2681 fs.mu.RLock() 2682 defer fs.mu.RUnlock() 2683 2684 if fs.state.Msgs == 0 || fs.noTrackSubjects() { 2685 return nil, nil 2686 } 2687 2688 lastBlkIndex := len(fs.blks) - 1 2689 lastMB := fs.blks[lastBlkIndex] 2690 2691 // Implied last sequence. 2692 if maxSeq == 0 { 2693 maxSeq = fs.state.LastSeq 2694 } else { 2695 // Udate last mb index if not last seq. 2696 lastBlkIndex, lastMB = fs.selectMsgBlockWithIndex(maxSeq) 2697 } 2698 //Make sure non-nil 2699 if lastMB == nil { 2700 return nil, nil 2701 } 2702 2703 // Grab our last mb index (not same as blk index). 2704 lastMB.mu.RLock() 2705 lastMBIndex := lastMB.index 2706 lastMB.mu.RUnlock() 2707 2708 subs := make(map[string]*psi) 2709 ltSeen := make(map[string]uint32) 2710 for _, filter := range filters { 2711 fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) { 2712 s := string(subj) 2713 subs[s] = psi 2714 if psi.lblk < lastMBIndex { 2715 ltSeen[s] = psi.lblk 2716 } 2717 }) 2718 } 2719 2720 // If all subjects have a lower last index, select the largest for our walk backwards. 2721 if len(ltSeen) == len(subs) { 2722 max := uint32(0) 2723 for _, mbi := range ltSeen { 2724 if mbi > max { 2725 max = mbi 2726 } 2727 } 2728 lastMB = fs.bim[max] 2729 } 2730 2731 // Collect all sequences needed. 2732 seqs := make([]uint64, 0, len(subs)) 2733 for i, lnf := lastBlkIndex, false; i >= 0; i-- { 2734 if len(subs) == 0 { 2735 break 2736 } 2737 mb := fs.blks[i] 2738 if !lnf { 2739 if mb != lastMB { 2740 continue 2741 } 2742 lnf = true 2743 } 2744 // We can start properly looking here. 2745 mb.mu.Lock() 2746 mb.ensurePerSubjectInfoLoaded() 2747 for subj, psi := range subs { 2748 if ss := mb.fss[subj]; ss != nil { 2749 if ss.Last <= maxSeq { 2750 seqs = append(seqs, ss.Last) 2751 delete(subs, subj) 2752 } else { 2753 // Need to search for it since last is > maxSeq. 
2754 if mb.cacheNotLoaded() { 2755 mb.loadMsgsWithLock() 2756 } 2757 var smv StoreMsg 2758 fseq := atomic.LoadUint64(&mb.first.seq) 2759 for seq := maxSeq; seq >= fseq; seq-- { 2760 sm, _ := mb.cacheLookup(seq, &smv) 2761 if sm == nil || sm.subj != subj { 2762 continue 2763 } 2764 seqs = append(seqs, sm.seq) 2765 delete(subs, subj) 2766 break 2767 } 2768 } 2769 } else if mb.index <= psi.fblk { 2770 // Track which subs are no longer applicable, meaning we will not find a valid msg at this point. 2771 delete(subs, subj) 2772 } 2773 // TODO(dlc) we could track lblk like above in case some subs are very far apart. 2774 // Not too bad if fss loaded since we will skip over quickly with it loaded, but might be worth it. 2775 } 2776 mb.mu.Unlock() 2777 2778 // If maxAllowed was sepcified check that we will not exceed that. 2779 if maxAllowed > 0 && len(seqs) > maxAllowed { 2780 return nil, ErrTooManyResults 2781 } 2782 2783 } 2784 if len(seqs) == 0 { 2785 return nil, nil 2786 } 2787 sort.Slice(seqs, func(i, j int) bool { return seqs[i] < seqs[j] }) 2788 return seqs, nil 2789 } 2790 2791 // NumPending will return the number of pending messages matching the filter subject starting at sequence. 2792 // Optimized for stream num pending calculations for consumers. 2793 func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) (total, validThrough uint64) { 2794 fs.mu.RLock() 2795 defer fs.mu.RUnlock() 2796 2797 // This can always be last for these purposes. 2798 validThrough = fs.state.LastSeq 2799 2800 if fs.state.Msgs == 0 || sseq > fs.state.LastSeq { 2801 return 0, validThrough 2802 } 2803 2804 // Track starting for both block for the sseq and staring block that matches any subject. 2805 var seqStart int 2806 // See if we need to figure out starting block per sseq. 2807 if sseq > fs.state.FirstSeq { 2808 // This should not, but can return -1, so make sure we check to avoid panic below. 
2809 if seqStart, _ = fs.selectMsgBlockWithIndex(sseq); seqStart < 0 { 2810 seqStart = 0 2811 } 2812 } 2813 2814 isAll := filter == _EMPTY_ || filter == fwcs 2815 wc := subjectHasWildcard(filter) 2816 2817 // See if filter was provided but its the only subject. 2818 if !isAll && !wc && fs.psim.Size() == 1 { 2819 if _, ok := fs.psim.Find(stringToBytes(filter)); ok { 2820 isAll = true 2821 } 2822 } 2823 if isAll && filter == _EMPTY_ { 2824 filter = fwcs 2825 } 2826 // If we are isAll and have no deleted we can do a simpler calculation. 2827 if !lastPerSubject && isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs { 2828 if sseq == 0 { 2829 return fs.state.Msgs, validThrough 2830 } 2831 return fs.state.LastSeq - sseq + 1, validThrough 2832 } 2833 2834 var tsa, fsa [32]string 2835 fts := tokenizeSubjectIntoSlice(fsa[:0], filter) 2836 2837 isMatch := func(subj string) bool { 2838 if isAll { 2839 return true 2840 } 2841 if !wc { 2842 return subj == filter 2843 } 2844 tts := tokenizeSubjectIntoSlice(tsa[:0], subj) 2845 return isSubsetMatchTokenized(tts, fts) 2846 } 2847 2848 // Handle last by subject a bit differently. 2849 // We will scan PSIM since we accurately track the last block we have seen the subject in. This 2850 // allows us to only need to load at most one block now. 2851 // For the last block, we need to track the subjects that we know are in that block, and track seen 2852 // while in the block itself, but complexity there worth it. 2853 if lastPerSubject { 2854 // If we want all and our start sequence is equal or less than first return number of subjects. 2855 if isAll && sseq <= fs.state.FirstSeq { 2856 return uint64(fs.psim.Size()), validThrough 2857 } 2858 // If we are here we need to scan. We are going to scan the PSIM looking for lblks that are >= seqStart. 2859 // This will build up a list of all subjects from the selected block onward. 
2860 lbm := make(map[string]bool) 2861 mb := fs.blks[seqStart] 2862 bi := mb.index 2863 2864 fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) { 2865 // If the select blk start is greater than entry's last blk skip. 2866 if bi > psi.lblk { 2867 return 2868 } 2869 total++ 2870 // We will track the subjects that are an exact match to the last block. 2871 // This is needed for last block processing. 2872 if psi.lblk == bi { 2873 lbm[string(subj)] = true 2874 } 2875 }) 2876 2877 // Now check if we need to inspect the seqStart block. 2878 // Grab write lock in case we need to load in msgs. 2879 mb.mu.Lock() 2880 var shouldExpire bool 2881 // We need to walk this block to correct accounting from above. 2882 if sseq > mb.first.seq { 2883 // Track the ones we add back in case more than one. 2884 seen := make(map[string]bool) 2885 // We need to discount the total by subjects seen before sseq, but also add them right back in if they are >= sseq for this blk. 2886 // This only should be subjects we know have the last blk in this block. 2887 if mb.cacheNotLoaded() { 2888 mb.loadMsgsWithLock() 2889 shouldExpire = true 2890 } 2891 var smv StoreMsg 2892 for seq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { 2893 sm, _ := mb.cacheLookup(seq, &smv) 2894 if sm == nil || sm.subj == _EMPTY_ || !lbm[sm.subj] { 2895 continue 2896 } 2897 if isMatch(sm.subj) { 2898 // If less than sseq adjust off of total as long as this subject matched the last block. 2899 if seq < sseq { 2900 if !seen[sm.subj] { 2901 total-- 2902 seen[sm.subj] = true 2903 } 2904 } else if seen[sm.subj] { 2905 // This is equal or more than sseq, so add back in. 2906 total++ 2907 // Make sure to not process anymore. 2908 delete(seen, sm.subj) 2909 } 2910 } 2911 } 2912 } 2913 // If we loaded the block try to force expire. 
2914 if shouldExpire { 2915 mb.tryForceExpireCacheLocked() 2916 } 2917 mb.mu.Unlock() 2918 return total, validThrough 2919 } 2920 2921 // If we would need to scan more from the beginning, revert back to calculating directly here. 2922 // TODO(dlc) - Redo properly with sublists etc for subject-based filtering. 2923 if seqStart >= (len(fs.blks) / 2) { 2924 for i := seqStart; i < len(fs.blks); i++ { 2925 var shouldExpire bool 2926 mb := fs.blks[i] 2927 // Hold write lock in case we need to load cache. 2928 mb.mu.Lock() 2929 var t uint64 2930 if isAll && sseq <= atomic.LoadUint64(&mb.first.seq) { 2931 total += mb.msgs 2932 mb.mu.Unlock() 2933 continue 2934 } 2935 // If we are here we need to at least scan the subject fss. 2936 // Make sure we have fss loaded. 2937 if mb.fssNotLoaded() { 2938 mb.loadMsgsWithLock() 2939 shouldExpire = true 2940 } 2941 var havePartial bool 2942 for subj, ss := range mb.fss { 2943 if isMatch(subj) { 2944 if ss.firstNeedsUpdate { 2945 mb.recalculateFirstForSubj(subj, ss.First, ss) 2946 } 2947 if sseq <= ss.First { 2948 t += ss.Msgs 2949 } else if sseq <= ss.Last { 2950 // We matched but its a partial. 2951 havePartial = true 2952 break 2953 } 2954 } 2955 } 2956 // See if we need to scan msgs here. 2957 if havePartial { 2958 // Make sure we have the cache loaded. 2959 if mb.cacheNotLoaded() { 2960 mb.loadMsgsWithLock() 2961 shouldExpire = true 2962 } 2963 // Clear on partial. 2964 t = 0 2965 var smv StoreMsg 2966 for seq, lseq := sseq, atomic.LoadUint64(&mb.last.seq); seq <= lseq; seq++ { 2967 if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && isMatch(sm.subj) { 2968 t++ 2969 } 2970 } 2971 } 2972 // If we loaded this block for this operation go ahead and expire it here. 2973 if shouldExpire { 2974 mb.tryForceExpireCacheLocked() 2975 } 2976 mb.mu.Unlock() 2977 total += t 2978 } 2979 return total, validThrough 2980 } 2981 2982 // If we are here it's better to calculate totals from psim and adjust downward by scanning less blocks. 
2983 // TODO(dlc) - Eventually when sublist uses generics, make this sublist driven instead. 2984 start := uint32(math.MaxUint32) 2985 fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) { 2986 total += psi.total 2987 // Keep track of start index for this subject. 2988 if psi.fblk < start { 2989 start = psi.fblk 2990 } 2991 }) 2992 // See if we were asked for all, if so we are done. 2993 if sseq <= fs.state.FirstSeq { 2994 return total, validThrough 2995 } 2996 2997 // If we are here we need to calculate partials for the first blocks. 2998 firstSubjBlk := fs.bim[start] 2999 var firstSubjBlkFound bool 3000 // Adjust in case not found. 3001 if firstSubjBlk == nil { 3002 firstSubjBlkFound = true 3003 } 3004 3005 // Track how many we need to adjust against the total. 3006 var adjust uint64 3007 for i := 0; i <= seqStart; i++ { 3008 mb := fs.blks[i] 3009 // We can skip blks if we know they are below the first one that has any subject matches. 3010 if !firstSubjBlkFound { 3011 if firstSubjBlkFound = (mb == firstSubjBlk); !firstSubjBlkFound { 3012 continue 3013 } 3014 } 3015 // We need to scan this block. 3016 var shouldExpire bool 3017 mb.mu.Lock() 3018 // Check if we should include all of this block in adjusting. If so work with metadata. 3019 if sseq > atomic.LoadUint64(&mb.last.seq) { 3020 if isAll { 3021 adjust += mb.msgs 3022 } else { 3023 // We need to adjust for all matches in this block. 3024 // Make sure we have fss loaded. This loads whole block now. 3025 if mb.fssNotLoaded() { 3026 mb.loadMsgsWithLock() 3027 shouldExpire = true 3028 } 3029 for subj, ss := range mb.fss { 3030 if isMatch(subj) { 3031 adjust += ss.Msgs 3032 } 3033 } 3034 } 3035 } else { 3036 // This is the last block. We need to scan per message here. 
3037 if mb.cacheNotLoaded() { 3038 mb.loadMsgsWithLock() 3039 shouldExpire = true 3040 } 3041 var last = atomic.LoadUint64(&mb.last.seq) 3042 if sseq < last { 3043 last = sseq 3044 } 3045 // We need to walk all messages in this block 3046 var smv StoreMsg 3047 for seq := atomic.LoadUint64(&mb.first.seq); seq < last; seq++ { 3048 sm, _ := mb.cacheLookup(seq, &smv) 3049 if sm == nil || sm.subj == _EMPTY_ { 3050 continue 3051 } 3052 // Check if it matches our filter. 3053 if sm.seq < sseq && isMatch(sm.subj) { 3054 adjust++ 3055 } 3056 } 3057 } 3058 // If we loaded the block try to force expire. 3059 if shouldExpire { 3060 mb.tryForceExpireCacheLocked() 3061 } 3062 mb.mu.Unlock() 3063 } 3064 // Make final adjustment. 3065 total -= adjust 3066 3067 return total, validThrough 3068 } 3069 3070 // SubjectsTotal return message totals per subject. 3071 func (fs *fileStore) SubjectsTotals(filter string) map[string]uint64 { 3072 fs.mu.RLock() 3073 defer fs.mu.RUnlock() 3074 3075 if fs.psim.Size() == 0 { 3076 return nil 3077 } 3078 // Match all if no filter given. 3079 if filter == _EMPTY_ { 3080 filter = fwcs 3081 } 3082 fst := make(map[string]uint64) 3083 fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) { 3084 fst[string(subj)] = psi.total 3085 }) 3086 return fst 3087 } 3088 3089 // RegisterStorageUpdates registers a callback for updates to storage changes. 3090 // It will present number of messages and bytes as a signed integer and an 3091 // optional sequence number of the message if a single. 3092 func (fs *fileStore) RegisterStorageUpdates(cb StorageUpdateHandler) { 3093 fs.mu.Lock() 3094 fs.scb = cb 3095 bsz := fs.state.Bytes 3096 fs.mu.Unlock() 3097 if cb != nil && bsz > 0 { 3098 cb(0, int64(bsz), 0, _EMPTY_) 3099 } 3100 } 3101 3102 // Helper to get hash key for specific message block. 
3103 // Lock should be held 3104 func (fs *fileStore) hashKeyForBlock(index uint32) []byte { 3105 return []byte(fmt.Sprintf("%s-%d", fs.cfg.Name, index)) 3106 } 3107 3108 func (mb *msgBlock) setupWriteCache(buf []byte) { 3109 // Make sure we have a cache setup. 3110 if mb.cache != nil { 3111 return 3112 } 3113 3114 // Setup simple cache. 3115 mb.cache = &cache{buf: buf} 3116 // Make sure we set the proper cache offset if we have existing data. 3117 var fi os.FileInfo 3118 if mb.mfd != nil { 3119 fi, _ = mb.mfd.Stat() 3120 } else if mb.mfn != _EMPTY_ { 3121 fi, _ = os.Stat(mb.mfn) 3122 } 3123 if fi != nil { 3124 mb.cache.off = int(fi.Size()) 3125 } 3126 mb.llts = time.Now().UnixNano() 3127 mb.startCacheExpireTimer() 3128 } 3129 3130 // This rolls to a new append msg block. 3131 // Lock should be held. 3132 func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { 3133 index := uint32(1) 3134 var rbuf []byte 3135 3136 if lmb := fs.lmb; lmb != nil { 3137 index = lmb.index + 1 3138 // Determine if we can reclaim any resources here. 3139 if fs.fip { 3140 lmb.mu.Lock() 3141 lmb.closeFDsLocked() 3142 if lmb.cache != nil { 3143 // Reset write timestamp and see if we can expire this cache. 3144 rbuf = lmb.tryExpireWriteCache() 3145 } 3146 lmb.mu.Unlock() 3147 } 3148 } 3149 3150 mb := fs.initMsgBlock(index) 3151 // Lock should be held to quiet race detector. 3152 mb.mu.Lock() 3153 mb.setupWriteCache(rbuf) 3154 mb.fss = make(map[string]*SimpleState) 3155 3156 // Set cache time to creation time to start. 3157 ts := time.Now().UnixNano() 3158 mb.llts, mb.lwts = 0, ts 3159 // Remember our last sequence number. 3160 atomic.StoreUint64(&mb.first.seq, fs.state.LastSeq+1) 3161 atomic.StoreUint64(&mb.last.seq, fs.state.LastSeq) 3162 mb.mu.Unlock() 3163 3164 // Now do local hash. 
3165 key := sha256.Sum256(fs.hashKeyForBlock(index)) 3166 hh, err := highwayhash.New64(key[:]) 3167 if err != nil { 3168 return nil, fmt.Errorf("could not create hash: %v", err) 3169 } 3170 mb.hh = hh 3171 3172 <-dios 3173 mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms) 3174 dios <- struct{}{} 3175 3176 if err != nil { 3177 mb.dirtyCloseWithRemove(true) 3178 return nil, fmt.Errorf("Error creating msg block file: %v", err) 3179 } 3180 mb.mfd = mfd 3181 3182 // Check if encryption is enabled. 3183 if fs.prf != nil { 3184 if err := fs.genEncryptionKeysForBlock(mb); err != nil { 3185 return nil, err 3186 } 3187 } 3188 3189 // If we know we will need this so go ahead and spin up. 3190 if !fs.fip { 3191 mb.spinUpFlushLoop() 3192 } 3193 3194 // Add to our list of blocks and mark as last. 3195 fs.addMsgBlock(mb) 3196 3197 return mb, nil 3198 } 3199 3200 // Generate the keys for this message block and write them out. 3201 func (fs *fileStore) genEncryptionKeysForBlock(mb *msgBlock) error { 3202 if mb == nil { 3203 return nil 3204 } 3205 key, bek, seed, encrypted, err := fs.genEncryptionKeys(fmt.Sprintf("%s:%d", fs.cfg.Name, mb.index)) 3206 if err != nil { 3207 return err 3208 } 3209 mb.aek, mb.bek, mb.seed, mb.nonce = key, bek, seed, encrypted[:key.NonceSize()] 3210 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 3211 keyFile := filepath.Join(mdir, fmt.Sprintf(keyScan, mb.index)) 3212 if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) { 3213 return err 3214 } 3215 <-dios 3216 err = os.WriteFile(keyFile, encrypted, defaultFilePerms) 3217 dios <- struct{}{} 3218 if err != nil { 3219 return err 3220 } 3221 mb.kfn = keyFile 3222 return nil 3223 } 3224 3225 // Stores a raw message with expected sequence number and timestamp. 3226 // Lock should be held. 
// A seq of 0 means use the next sequence; any other value that is not the
// last sequence + 1 results in ErrSequenceMismatch.
func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) (err error) {
	if fs.closed {
		return ErrStoreClosed
	}

	// Per subject max check needed.
	mmp := uint64(fs.cfg.MaxMsgsPer)
	// psmc is the per subject message count before this message is written.
	var psmc uint64
	psmax := mmp > 0 && len(subj) > 0
	if psmax {
		if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
			psmc = info.total
		}
	}

	var fseq uint64
	// Check if we are discarding new messages when we reach the limit.
	if fs.cfg.Discard == DiscardNew {
		// asl is set when the subject is at its per subject limit and an
		// older message of that subject can be replaced.
		var asl bool
		if psmax && psmc >= mmp {
			// If we are instructed to discard new per subject, this is an error.
			if fs.cfg.DiscardNewPer {
				return ErrMaxMsgsPerSubject
			}
			if fseq, err = fs.firstSeqForSubj(subj); err != nil {
				return err
			}
			asl = true
		}
		// If we are discard new and limits policy and clustered, we do the enforcement
		// above and should not disqualify the message here since it could cause replicas to drift.
		if fs.cfg.Retention == LimitsPolicy || fs.cfg.Replicas == 1 {
			if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl {
				return ErrMaxMsgs
			}
			if fs.cfg.MaxBytes > 0 && fs.state.Bytes+fileStoreMsgSize(subj, hdr, msg) >= uint64(fs.cfg.MaxBytes) {
				if !asl || fs.sizeForSeq(fseq) <= int(fileStoreMsgSize(subj, hdr, msg)) {
					return ErrMaxBytes
				}
			}
		}
	}

	// Check sequence.
	if seq != fs.state.LastSeq+1 {
		if seq > 0 {
			return ErrSequenceMismatch
		}
		seq = fs.state.LastSeq + 1
	}

	// Write msg record.
	n, err := fs.writeMsgRecord(seq, ts, subj, hdr, msg)
	if err != nil {
		return err
	}

	// Adjust top level tracking of per subject msg counts.
	if len(subj) > 0 && fs.psim != nil {
		index := fs.lmb.index
		if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
			info.total++
			if index > info.lblk {
				info.lblk = index
			}
		} else {
			fs.psim.Insert(stringToBytes(subj), psi{total: 1, fblk: index, lblk: index})
			fs.tsl += len(subj)
		}
	}

	// Adjust first if needed.
	now := time.Unix(0, ts).UTC()
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq = seq
		fs.state.FirstTime = now
	}

	fs.state.Msgs++
	fs.state.Bytes += n
	fs.state.LastSeq = seq
	fs.state.LastTime = now

	// Enforce per message limits.
	// We snapshotted psmc before our actual write, so >= comparison needed.
	if psmax && psmc >= mmp {
		// We may have done this above.
		if fseq == 0 {
			fseq, _ = fs.firstSeqForSubj(subj)
		}
		if ok, _ := fs.removeMsgViaLimits(fseq); ok {
			// Make sure we are below the limit.
			if psmc--; psmc >= mmp {
				// Still over, keep removing the first message for this subject
				// until we are within the limit or a removal fails.
				bsubj := stringToBytes(subj)
				for info, ok := fs.psim.Find(bsubj); ok && info.total > mmp; info, ok = fs.psim.Find(bsubj) {
					if seq, _ := fs.firstSeqForSubj(subj); seq > 0 {
						if ok, _ := fs.removeMsgViaLimits(seq); !ok {
							break
						}
					} else {
						break
					}
				}
			}
		} else if mb := fs.selectMsgBlock(fseq); mb != nil {
			// If we are here we could not remove fseq from above, so rebuild.
			var ld *LostStreamData
			if ld, _, _ = mb.rebuildState(); ld != nil {
				fs.rebuildStateLocked(ld)
			}
		}
	}

	// Limits checks and enforcement.
	// If they do any deletions they will update the
	// byte count on their own, so no need to compensate.
	fs.enforceMsgLimit()
	fs.enforceBytesLimit()

	// Check if we have and need the age expiration timer running.
	if fs.ageChk == nil && fs.cfg.MaxAge != 0 {
		fs.startAgeChk()
	}

	return nil
}

// StoreRawMsg stores a raw message with expected sequence number and timestamp.
// On success the registered storage update callback, if any, is invoked
// after the store lock has been released.
func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts int64) error {
	fs.mu.Lock()
	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	cb := fs.scb
	// Check if first message timestamp requires expiry
	// sooner than initial replica expiry timer set to MaxAge when initializing.
	if !fs.receivedAny && fs.cfg.MaxAge != 0 && ts > 0 {
		fs.receivedAny = true
		// don't block here by calling expireMsgs directly.
		// Instead, set short timeout.
		fs.resetAgeChk(int64(time.Millisecond * 50))
	}
	fs.mu.Unlock()

	if err == nil && cb != nil {
		cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	}

	return err
}

// Store stores a message. We hold the main filestore lock for any write operation.
// It returns the assigned sequence and timestamp, both zero on error.
func (fs *fileStore) StoreMsg(subj string, hdr, msg []byte) (uint64, int64, error) {
	fs.mu.Lock()
	seq, ts := fs.state.LastSeq+1, time.Now().UnixNano()
	err := fs.storeRawMsg(subj, hdr, msg, seq, ts)
	cb := fs.scb
	fs.mu.Unlock()

	if err != nil {
		seq, ts = 0, 0
	} else if cb != nil {
		cb(1, int64(fileStoreMsgSize(subj, hdr, msg)), seq, subj)
	}

	return seq, ts, err
}

// skipMsg will update this message block for a skipped message.
// If we do not have any messages, just update the metadata, otherwise
// we will place an empty record marking the sequence as used. The
// sequence will be marked erased.
// fs lock should be held.
func (mb *msgBlock) skipMsg(seq uint64, now time.Time) {
	if mb == nil {
		return
	}
	var needsRecord bool

	nowts := now.UnixNano()

	mb.mu.Lock()
	// If we are empty can just do meta.
	if mb.msgs == 0 {
		atomic.StoreUint64(&mb.last.seq, seq)
		mb.last.ts = nowts
		atomic.StoreUint64(&mb.first.seq, seq+1)
		mb.first.ts = nowts
	} else {
		needsRecord = true
		mb.dmap.Insert(seq)
	}
	mb.mu.Unlock()

	// The record is written after the block lock has been released.
	if needsRecord {
		mb.writeMsgRecord(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, nowts, true)
	} else {
		mb.kickFlusher()
	}
}

// SkipMsg will use the next sequence number but not store anything.
// It returns the skipped sequence, or 0 if a new message block was needed
// and could not be created.
func (fs *fileStore) SkipMsg() uint64 {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Grab our current last message block.
	mb := fs.lmb
	if mb == nil || mb.msgs > 0 && mb.blkSize()+emptyRecordLen > fs.fcfg.BlockSize {
		if mb != nil && fs.fcfg.Compression != NoCompression {
			// We've now reached the end of this message block, if we want
			// to compress blocks then now's the time to do it.
			go mb.recompressOnDiskIfNeeded()
		}
		var err error
		if mb, err = fs.newMsgBlockForWrite(); err != nil {
			return 0
		}
	}

	// Grab time and last seq.
	now, seq := time.Now().UTC(), fs.state.LastSeq+1

	// Write skip msg.
	mb.skipMsg(seq, now)

	// Update fs state.
	fs.state.LastSeq, fs.state.LastTime = seq, now
	if fs.state.Msgs == 0 {
		fs.state.FirstSeq, fs.state.FirstTime = seq, now
	}
	// The skipped sequence holds no message, so first moves past it.
	if seq == fs.state.FirstSeq {
		fs.state.FirstSeq, fs.state.FirstTime = seq+1, now
	}
	// Mark as dirty for stream state.
	fs.dirty++

	return seq
}

// Skip multiple msgs. We will determine if we can fit into current lmb or we need to create a new block.
func (fs *fileStore) SkipMsgs(seq uint64, num uint64) error {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	// Check sequence matches our last sequence.
3471 if seq != fs.state.LastSeq+1 { 3472 if seq > 0 { 3473 return ErrSequenceMismatch 3474 } 3475 seq = fs.state.LastSeq + 1 3476 } 3477 3478 // Limit number of dmap entries 3479 const maxDeletes = 64 * 1024 3480 mb := fs.lmb 3481 3482 numDeletes := int(num) 3483 if mb != nil { 3484 numDeletes += mb.dmap.Size() 3485 } 3486 if mb == nil || numDeletes > maxDeletes && mb.msgs > 0 || mb.msgs > 0 && mb.blkSize()+emptyRecordLen > fs.fcfg.BlockSize { 3487 if mb != nil && fs.fcfg.Compression != NoCompression { 3488 // We've now reached the end of this message block, if we want 3489 // to compress blocks then now's the time to do it. 3490 go mb.recompressOnDiskIfNeeded() 3491 } 3492 var err error 3493 if mb, err = fs.newMsgBlockForWrite(); err != nil { 3494 return err 3495 } 3496 } 3497 3498 // Insert into dmap all entries and place last as marker. 3499 now := time.Now().UTC() 3500 nowts := now.UnixNano() 3501 lseq := seq + num - 1 3502 3503 mb.mu.Lock() 3504 var needsRecord bool 3505 // If we are empty update meta directly. 3506 if mb.msgs == 0 { 3507 atomic.StoreUint64(&mb.last.seq, lseq) 3508 mb.last.ts = nowts 3509 atomic.StoreUint64(&mb.first.seq, lseq+1) 3510 mb.first.ts = nowts 3511 } else { 3512 needsRecord = true 3513 for ; seq <= lseq; seq++ { 3514 mb.dmap.Insert(seq) 3515 } 3516 } 3517 mb.mu.Unlock() 3518 3519 // Write out our placeholder. 3520 if needsRecord { 3521 mb.writeMsgRecord(emptyRecordLen, lseq|ebit, _EMPTY_, nil, nil, nowts, true) 3522 } 3523 3524 // Now update FS accounting. 3525 // Update fs state. 3526 fs.state.LastSeq, fs.state.LastTime = lseq, now 3527 if fs.state.Msgs == 0 { 3528 fs.state.FirstSeq, fs.state.FirstTime = lseq+1, now 3529 } 3530 3531 // Mark as dirty for stream state. 3532 fs.dirty++ 3533 3534 return nil 3535 } 3536 3537 // Lock should be held. 
3538 func (fs *fileStore) rebuildFirst() { 3539 if len(fs.blks) == 0 { 3540 return 3541 } 3542 fmb := fs.blks[0] 3543 if fmb == nil { 3544 return 3545 } 3546 3547 ld, _, _ := fmb.rebuildState() 3548 fmb.mu.RLock() 3549 isEmpty := fmb.msgs == 0 3550 fmb.mu.RUnlock() 3551 if isEmpty { 3552 fmb.mu.Lock() 3553 fs.removeMsgBlock(fmb) 3554 fmb.mu.Unlock() 3555 } 3556 fs.selectNextFirst() 3557 fs.rebuildStateLocked(ld) 3558 } 3559 3560 // Optimized helper function to return first sequence. 3561 // subj will always be publish subject here, meaning non-wildcard. 3562 // We assume a fast check that this subj even exists already happened. 3563 // Lock should be held. 3564 func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) { 3565 if len(fs.blks) == 0 { 3566 return 0, nil 3567 } 3568 3569 // See if we can optimize where we start. 3570 start, stop := fs.blks[0].index, fs.lmb.index 3571 if info, ok := fs.psim.Find(stringToBytes(subj)); ok { 3572 start, stop = info.fblk, info.lblk 3573 } 3574 3575 for i := start; i <= stop; i++ { 3576 mb := fs.bim[i] 3577 if mb == nil { 3578 continue 3579 } 3580 mb.mu.Lock() 3581 var shouldExpire bool 3582 if mb.fssNotLoaded() { 3583 // Make sure we have fss loaded. 3584 if err := mb.loadMsgsWithLock(); err != nil { 3585 mb.mu.Unlock() 3586 return 0, err 3587 } 3588 shouldExpire = true 3589 } 3590 if ss := mb.fss[subj]; ss != nil { 3591 // Adjust first if it was not where we thought it should be. 3592 if i != start { 3593 if info, ok := fs.psim.Find(stringToBytes(subj)); ok { 3594 info.fblk = i 3595 } 3596 } 3597 if ss.firstNeedsUpdate { 3598 mb.recalculateFirstForSubj(subj, ss.First, ss) 3599 } 3600 mb.mu.Unlock() 3601 return ss.First, nil 3602 } 3603 // If we did not find it and we loaded this msgBlock try to expire as long as not the last. 3604 if shouldExpire { 3605 // Expire this cache before moving on. 
3606 mb.tryForceExpireCacheLocked() 3607 } 3608 mb.mu.Unlock() 3609 } 3610 return 0, nil 3611 } 3612 3613 // Will check the msg limit and drop firstSeq msg if needed. 3614 // Lock should be held. 3615 func (fs *fileStore) enforceMsgLimit() { 3616 if fs.cfg.Discard != DiscardOld { 3617 return 3618 } 3619 if fs.cfg.MaxMsgs <= 0 || fs.state.Msgs <= uint64(fs.cfg.MaxMsgs) { 3620 return 3621 } 3622 for nmsgs := fs.state.Msgs; nmsgs > uint64(fs.cfg.MaxMsgs); nmsgs = fs.state.Msgs { 3623 if removed, err := fs.deleteFirstMsg(); err != nil || !removed { 3624 fs.rebuildFirst() 3625 return 3626 } 3627 } 3628 } 3629 3630 // Will check the bytes limit and drop msgs if needed. 3631 // Lock should be held. 3632 func (fs *fileStore) enforceBytesLimit() { 3633 if fs.cfg.Discard != DiscardOld { 3634 return 3635 } 3636 if fs.cfg.MaxBytes <= 0 || fs.state.Bytes <= uint64(fs.cfg.MaxBytes) { 3637 return 3638 } 3639 for bs := fs.state.Bytes; bs > uint64(fs.cfg.MaxBytes); bs = fs.state.Bytes { 3640 if removed, err := fs.deleteFirstMsg(); err != nil || !removed { 3641 fs.rebuildFirst() 3642 return 3643 } 3644 } 3645 } 3646 3647 // Will make sure we have limits honored for max msgs per subject on recovery or config update. 3648 // We will make sure to go through all msg blocks etc. but in practice this 3649 // will most likely only be the last one, so can take a more conservative approach. 3650 // Lock should be held. 3651 func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) { 3652 maxMsgsPer := uint64(fs.cfg.MaxMsgsPer) 3653 3654 // We may want to suppress callbacks from remove during this process 3655 // since these should have already been deleted and accounted for. 3656 if !fireCallback { 3657 cb := fs.scb 3658 fs.scb = nil 3659 defer func() { fs.scb = cb }() 3660 } 3661 3662 var numMsgs uint64 3663 3664 // collect all that are not correct. 
3665 needAttention := make(map[string]*psi) 3666 fs.psim.Match([]byte(fwcs), func(subj []byte, psi *psi) { 3667 numMsgs += psi.total 3668 if psi.total > maxMsgsPer { 3669 needAttention[string(subj)] = psi 3670 } 3671 }) 3672 3673 // We had an issue with a use case where psim (and hence fss) were correct but idx was not and was not properly being caught. 3674 // So do a quick sanity check here. If we detect a skew do a rebuild then re-check. 3675 if numMsgs != fs.state.Msgs { 3676 fs.warn("Detected skew in subject-based total (%d) vs raw total (%d), rebuilding", numMsgs, fs.state.Msgs) 3677 // Clear any global subject state. 3678 fs.psim, fs.tsl = fs.psim.Empty(), 0 3679 for _, mb := range fs.blks { 3680 ld, _, err := mb.rebuildState() 3681 if err != nil && ld != nil { 3682 fs.addLostData(ld) 3683 } 3684 fs.populateGlobalPerSubjectInfo(mb) 3685 } 3686 // Rebuild fs state too. 3687 fs.rebuildStateLocked(nil) 3688 // Need to redo blocks that need attention. 3689 needAttention = make(map[string]*psi) 3690 fs.psim.Match([]byte(fwcs), func(subj []byte, psi *psi) { 3691 if psi.total > maxMsgsPer { 3692 needAttention[string(subj)] = psi 3693 } 3694 }) 3695 } 3696 3697 // Collect all the msgBlks we alter. 3698 blks := make(map[*msgBlock]struct{}) 3699 3700 // For re-use below. 3701 var sm StoreMsg 3702 3703 // Walk all subjects that need attention here. 3704 for subj, info := range needAttention { 3705 total, start, stop := info.total, info.fblk, info.lblk 3706 3707 for i := start; i <= stop; i++ { 3708 mb := fs.bim[i] 3709 if mb == nil { 3710 continue 3711 } 3712 // Grab the ss entry for this subject in case sparse. 
3713 mb.mu.Lock() 3714 mb.ensurePerSubjectInfoLoaded() 3715 ss := mb.fss[subj] 3716 if ss != nil && ss.firstNeedsUpdate { 3717 mb.recalculateFirstForSubj(subj, ss.First, ss) 3718 } 3719 mb.mu.Unlock() 3720 if ss == nil { 3721 continue 3722 } 3723 for seq := ss.First; seq <= ss.Last && total > maxMsgsPer; { 3724 m, _, err := mb.firstMatching(subj, false, seq, &sm) 3725 if err == nil { 3726 seq = m.seq + 1 3727 if removed, _ := fs.removeMsgViaLimits(m.seq); removed { 3728 total-- 3729 blks[mb] = struct{}{} 3730 } 3731 } else { 3732 // On error just do single increment. 3733 seq++ 3734 } 3735 } 3736 } 3737 } 3738 3739 // Expire the cache if we can. 3740 for mb := range blks { 3741 mb.mu.Lock() 3742 if mb.msgs > 0 { 3743 mb.tryForceExpireCacheLocked() 3744 } 3745 mb.mu.Unlock() 3746 } 3747 } 3748 3749 // Lock should be held. 3750 func (fs *fileStore) deleteFirstMsg() (bool, error) { 3751 return fs.removeMsgViaLimits(fs.state.FirstSeq) 3752 } 3753 3754 // If we remove via limits that can always be recovered on a restart we 3755 // do not force the system to update the index file. 3756 // Lock should be held. 3757 func (fs *fileStore) removeMsgViaLimits(seq uint64) (bool, error) { 3758 return fs.removeMsg(seq, false, true, false) 3759 } 3760 3761 // RemoveMsg will remove the message from this store. 3762 // Will return the number of bytes removed. 3763 func (fs *fileStore) RemoveMsg(seq uint64) (bool, error) { 3764 return fs.removeMsg(seq, false, false, true) 3765 } 3766 3767 func (fs *fileStore) EraseMsg(seq uint64) (bool, error) { 3768 return fs.removeMsg(seq, true, false, true) 3769 } 3770 3771 // Convenience function to remove per subject tracking at the filestore level. 3772 // Lock should be held. 3773 func (fs *fileStore) removePerSubject(subj string) { 3774 if len(subj) == 0 || fs.psim == nil { 3775 return 3776 } 3777 // We do not update sense of fblk here but will do so when we resolve during lookup. 
3778 bsubj := stringToBytes(subj) 3779 if info, ok := fs.psim.Find(bsubj); ok { 3780 info.total-- 3781 if info.total == 1 { 3782 info.fblk = info.lblk 3783 } else if info.total == 0 { 3784 if _, ok = fs.psim.Delete(bsubj); ok { 3785 fs.tsl -= len(subj) 3786 } 3787 } 3788 } 3789 } 3790 3791 // Remove a message, optionally rewriting the mb file. 3792 func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (bool, error) { 3793 if seq == 0 { 3794 return false, ErrStoreMsgNotFound 3795 } 3796 fsLock := func() { 3797 if needFSLock { 3798 fs.mu.Lock() 3799 } 3800 } 3801 fsUnlock := func() { 3802 if needFSLock { 3803 fs.mu.Unlock() 3804 } 3805 } 3806 3807 fsLock() 3808 3809 if fs.closed { 3810 fsUnlock() 3811 return false, ErrStoreClosed 3812 } 3813 if !viaLimits && fs.sips > 0 { 3814 fsUnlock() 3815 return false, ErrStoreSnapshotInProgress 3816 } 3817 // If in encrypted mode negate secure rewrite here. 3818 if secure && fs.prf != nil { 3819 secure = false 3820 } 3821 3822 mb := fs.selectMsgBlock(seq) 3823 if mb == nil { 3824 var err = ErrStoreEOF 3825 if seq <= fs.state.LastSeq { 3826 err = ErrStoreMsgNotFound 3827 } 3828 fsUnlock() 3829 return false, err 3830 } 3831 3832 mb.mu.Lock() 3833 3834 // See if we are closed or the sequence number is still relevant or if we know its deleted. 3835 if mb.closed || seq < atomic.LoadUint64(&mb.first.seq) || mb.dmap.Exists(seq) { 3836 mb.mu.Unlock() 3837 fsUnlock() 3838 return false, nil 3839 } 3840 3841 // We used to not have to load in the messages except with callbacks or the filtered subject state (which is now always on). 3842 // Now just load regardless. 3843 // TODO(dlc) - Figure out a way not to have to load it in, we need subject tracking outside main data block. 
3844 if mb.cacheNotLoaded() { 3845 if err := mb.loadMsgsWithLock(); err != nil { 3846 mb.mu.Unlock() 3847 fsUnlock() 3848 return false, err 3849 } 3850 } 3851 3852 var smv StoreMsg 3853 sm, err := mb.cacheLookup(seq, &smv) 3854 if err != nil { 3855 mb.mu.Unlock() 3856 fsUnlock() 3857 // Mimic err behavior from above check to dmap. No error returned if already removed. 3858 if err == errDeletedMsg { 3859 err = nil 3860 } 3861 return false, err 3862 } 3863 // Grab size 3864 msz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) 3865 3866 // Set cache timestamp for last remove. 3867 mb.lrts = time.Now().UnixNano() 3868 3869 // Global stats 3870 if fs.state.Msgs > 0 { 3871 fs.state.Msgs-- 3872 } 3873 if msz < fs.state.Bytes { 3874 fs.state.Bytes -= msz 3875 } else { 3876 fs.state.Bytes = 0 3877 } 3878 3879 // Now local mb updates. 3880 if mb.msgs > 0 { 3881 mb.msgs-- 3882 } 3883 if msz < mb.bytes { 3884 mb.bytes -= msz 3885 } else { 3886 mb.bytes = 0 3887 } 3888 3889 // Mark as dirty for stream state. 3890 fs.dirty++ 3891 3892 // If we are tracking subjects here make sure we update that accounting. 3893 mb.ensurePerSubjectInfoLoaded() 3894 3895 // If we are tracking multiple subjects here make sure we update that accounting. 3896 mb.removeSeqPerSubject(sm.subj, seq) 3897 fs.removePerSubject(sm.subj) 3898 3899 if secure { 3900 // Grab record info. 3901 ri, rl, _, _ := mb.slotInfo(int(seq - mb.cache.fseq)) 3902 if err := mb.eraseMsg(seq, int(ri), int(rl)); err != nil { 3903 return false, err 3904 } 3905 } 3906 3907 fifo := seq == atomic.LoadUint64(&mb.first.seq) 3908 isLastBlock := mb == fs.lmb 3909 isEmpty := mb.msgs == 0 3910 3911 if fifo { 3912 mb.selectNextFirst() 3913 if !isEmpty { 3914 // Can update this one in place. 3915 if seq == fs.state.FirstSeq { 3916 fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) // new one. 
3917 if mb.first.ts == 0 { 3918 fs.state.FirstTime = time.Time{} 3919 } else { 3920 fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() 3921 } 3922 } 3923 } 3924 } else if !isEmpty { 3925 // Out of order delete. 3926 mb.dmap.Insert(seq) 3927 // Make simple check here similar to Compact(). If we can save 50% and over a certain threshold do inline. 3928 // All other more thorough cleanup will happen in syncBlocks logic. 3929 // Note that we do not have to store empty records for the deleted, so don't use to calculate. 3930 // TODO(dlc) - This should not be inline, should kick the sync routine. 3931 if mb.rbytes > compactMinimum && mb.bytes*2 < mb.rbytes && !isLastBlock { 3932 mb.compact() 3933 } 3934 } 3935 3936 if secure { 3937 if ld, _ := mb.flushPendingMsgsLocked(); ld != nil { 3938 // We have the mb lock here, this needs the mb locks so do in its own go routine. 3939 go fs.rebuildState(ld) 3940 } 3941 } 3942 3943 // If empty remove this block and check if we need to update first sequence. 3944 // We will write a tombstone at the end. 3945 var firstSeqNeedsUpdate bool 3946 if isEmpty { 3947 // This writes tombstone iff mb == lmb, so no need to do below. 3948 fs.removeMsgBlock(mb) 3949 firstSeqNeedsUpdate = seq == fs.state.FirstSeq 3950 } 3951 mb.mu.Unlock() 3952 3953 // If we emptied the current message block and the seq was state.FirstSeq 3954 // then we need to jump message blocks. We will also write the index so 3955 // we don't lose track of the first sequence. 3956 if firstSeqNeedsUpdate { 3957 fs.selectNextFirst() 3958 } 3959 3960 // Check if we need to write a deleted record tombstone. 3961 // This is for user initiated removes or to hold the first seq 3962 // when the last block is empty. 3963 3964 // If not via limits and not empty and last (empty writes tombstone above if last) write tombstone. 
3965 if !viaLimits && !(isEmpty && isLastBlock) { 3966 if lmb := fs.lmb; sm != nil && lmb != nil { 3967 lmb.writeTombstone(sm.seq, sm.ts) 3968 } 3969 } 3970 3971 if cb := fs.scb; cb != nil { 3972 // If we have a callback registered we need to release lock regardless since cb might need it to lookup msg, etc. 3973 fs.mu.Unlock() 3974 // Storage updates. 3975 var subj string 3976 if sm != nil { 3977 subj = sm.subj 3978 } 3979 delta := int64(msz) 3980 cb(-1, -delta, seq, subj) 3981 3982 if !needFSLock { 3983 fs.mu.Lock() 3984 } 3985 } else if needFSLock { 3986 // We acquired it so release it. 3987 fs.mu.Unlock() 3988 } 3989 3990 return true, nil 3991 } 3992 3993 // This will compact and rewrite this block. This should only be called when we know we want to rewrite this block. 3994 // This should not be called on the lmb since we will prune tail deleted messages which could cause issues with 3995 // writing new messages. We will silently bail on any issues with the underlying block and let someone else detect. 3996 // Write lock needs to be held. 3997 func (mb *msgBlock) compact() { 3998 wasLoaded := mb.cacheAlreadyLoaded() 3999 if !wasLoaded { 4000 if err := mb.loadMsgsWithLock(); err != nil { 4001 return 4002 } 4003 } 4004 4005 buf := mb.cache.buf 4006 nbuf := getMsgBlockBuf(len(buf)) 4007 // Recycle our nbuf when we are done. 4008 defer recycleMsgBlockBuf(nbuf) 4009 4010 var le = binary.LittleEndian 4011 var firstSet bool 4012 4013 fseq := atomic.LoadUint64(&mb.first.seq) 4014 isDeleted := func(seq uint64) bool { 4015 return seq == 0 || seq&ebit != 0 || mb.dmap.Exists(seq) || seq < fseq 4016 } 4017 4018 for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; { 4019 if index+msgHdrSize > lbuf { 4020 return 4021 } 4022 hdr := buf[index : index+msgHdrSize] 4023 rl, slen := le.Uint32(hdr[0:]), le.Uint16(hdr[20:]) 4024 // Clear any headers bit that could be set. 4025 rl &^= hbit 4026 dlen := int(rl) - msgHdrSize 4027 // Do some quick sanity checks here. 
4028 if dlen < 0 || int(slen) > dlen || dlen > int(rl) || rl > rlBadThresh || index+rl > lbuf { 4029 return 4030 } 4031 // Only need to process non-deleted messages. 4032 seq := le.Uint64(hdr[4:]) 4033 4034 if !isDeleted(seq) { 4035 // Check for tombstones. 4036 if seq&tbit != 0 { 4037 // If we are last mb we should consider to keep these unless the tombstone reflects a seq in this mb. 4038 if mb == mb.fs.lmb && seq < fseq { 4039 nbuf = append(nbuf, buf[index:index+rl]...) 4040 } 4041 } else { 4042 // Normal message here. 4043 nbuf = append(nbuf, buf[index:index+rl]...) 4044 if !firstSet { 4045 firstSet = true 4046 atomic.StoreUint64(&mb.first.seq, seq) 4047 } 4048 } 4049 } 4050 // Advance to next record. 4051 index += rl 4052 } 4053 4054 // Handle compression 4055 if mb.cmp != NoCompression { 4056 cbuf, err := mb.cmp.Compress(nbuf) 4057 if err != nil { 4058 return 4059 } 4060 meta := &CompressionInfo{ 4061 Algorithm: mb.cmp, 4062 OriginalSize: uint64(len(nbuf)), 4063 } 4064 nbuf = append(meta.MarshalMetadata(), cbuf...) 4065 } 4066 4067 // Check for encryption. 4068 if mb.bek != nil && len(nbuf) > 0 { 4069 // Recreate to reset counter. 4070 rbek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 4071 if err != nil { 4072 return 4073 } 4074 rbek.XORKeyStream(nbuf, nbuf) 4075 } 4076 4077 // Close FDs first. 4078 mb.closeFDsLocked() 4079 4080 // We will write to a new file and mv/rename it in case of failure. 4081 mfn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(newScan, mb.index)) 4082 <-dios 4083 err := os.WriteFile(mfn, nbuf, defaultFilePerms) 4084 dios <- struct{}{} 4085 if err != nil { 4086 os.Remove(mfn) 4087 return 4088 } 4089 if err := os.Rename(mfn, mb.mfn); err != nil { 4090 os.Remove(mfn) 4091 return 4092 } 4093 4094 // Capture the updated rbytes. 4095 mb.rbytes = uint64(len(nbuf)) 4096 4097 // Remove any seqs from the beginning of the blk. 
4098 for seq, nfseq := fseq, atomic.LoadUint64(&mb.first.seq); seq < nfseq; seq++ { 4099 mb.dmap.Delete(seq) 4100 } 4101 // Make sure we clear the cache since no longer valid. 4102 mb.clearCacheAndOffset() 4103 // If we entered with the msgs loaded make sure to reload them. 4104 if wasLoaded { 4105 mb.loadMsgsWithLock() 4106 } 4107 } 4108 4109 // Grab info from a slot. 4110 // Lock should be held. 4111 func (mb *msgBlock) slotInfo(slot int) (uint32, uint32, bool, error) { 4112 if mb.cache == nil || slot >= len(mb.cache.idx) { 4113 return 0, 0, false, errPartialCache 4114 } 4115 4116 bi := mb.cache.idx[slot] 4117 ri, hashChecked := (bi &^ hbit), (bi&hbit) != 0 4118 4119 // If this is a deleted slot return here. 4120 if bi == dbit { 4121 return 0, 0, false, errDeletedMsg 4122 } 4123 4124 // Determine record length 4125 var rl uint32 4126 if slot >= len(mb.cache.idx) { 4127 rl = mb.cache.lrl 4128 } else { 4129 // Need to account for dbit markers in idx. 4130 // So we will walk until we find valid idx slot to calculate rl. 4131 for i := 1; slot+i < len(mb.cache.idx); i++ { 4132 ni := mb.cache.idx[slot+i] &^ hbit 4133 if ni == dbit { 4134 continue 4135 } 4136 rl = ni - ri 4137 break 4138 } 4139 // check if we had all trailing dbits. 4140 // If so use len of cache buf minus ri. 4141 if rl == 0 { 4142 rl = uint32(len(mb.cache.buf)) - ri 4143 } 4144 } 4145 if rl < msgHdrSize { 4146 return 0, 0, false, errBadMsg 4147 } 4148 return uint32(ri), rl, hashChecked, nil 4149 } 4150 4151 func (fs *fileStore) isClosed() bool { 4152 fs.mu.RLock() 4153 closed := fs.closed 4154 fs.mu.RUnlock() 4155 return closed 4156 } 4157 4158 // Will spin up our flush loop. 4159 func (mb *msgBlock) spinUpFlushLoop() { 4160 mb.mu.Lock() 4161 defer mb.mu.Unlock() 4162 4163 // Are we already running or closed? 
4164 if mb.flusher || mb.closed { 4165 return 4166 } 4167 mb.flusher = true 4168 mb.fch = make(chan struct{}, 1) 4169 mb.qch = make(chan struct{}) 4170 fch, qch := mb.fch, mb.qch 4171 4172 go mb.flushLoop(fch, qch) 4173 } 4174 4175 // Raw low level kicker for flush loops. 4176 func kickFlusher(fch chan struct{}) { 4177 if fch != nil { 4178 select { 4179 case fch <- struct{}{}: 4180 default: 4181 } 4182 } 4183 } 4184 4185 // Kick flusher for this message block. 4186 func (mb *msgBlock) kickFlusher() { 4187 mb.mu.RLock() 4188 defer mb.mu.RUnlock() 4189 kickFlusher(mb.fch) 4190 } 4191 4192 func (mb *msgBlock) setInFlusher() { 4193 mb.mu.Lock() 4194 mb.flusher = true 4195 mb.mu.Unlock() 4196 } 4197 4198 func (mb *msgBlock) clearInFlusher() { 4199 mb.mu.Lock() 4200 mb.flusher = false 4201 mb.mu.Unlock() 4202 } 4203 4204 // flushLoop watches for messages, index info, or recently closed msg block updates. 4205 func (mb *msgBlock) flushLoop(fch, qch chan struct{}) { 4206 mb.setInFlusher() 4207 defer mb.clearInFlusher() 4208 4209 for { 4210 select { 4211 case <-fch: 4212 // If we have pending messages process them first. 4213 if waiting := mb.pendingWriteSize(); waiting != 0 { 4214 ts := 1 * time.Millisecond 4215 var waited time.Duration 4216 4217 for waiting < coalesceMinimum { 4218 time.Sleep(ts) 4219 select { 4220 case <-qch: 4221 return 4222 default: 4223 } 4224 newWaiting := mb.pendingWriteSize() 4225 if waited = waited + ts; waited > maxFlushWait || newWaiting <= waiting { 4226 break 4227 } 4228 waiting = newWaiting 4229 ts *= 2 4230 } 4231 mb.flushPendingMsgs() 4232 // Check if we are no longer the last message block. If we are 4233 // not we can close FDs and exit. 4234 mb.fs.mu.RLock() 4235 notLast := mb != mb.fs.lmb 4236 mb.fs.mu.RUnlock() 4237 if notLast { 4238 if err := mb.closeFDs(); err == nil { 4239 return 4240 } 4241 } 4242 } 4243 case <-qch: 4244 return 4245 } 4246 } 4247 } 4248 4249 // Lock should be held. 
4250 func (mb *msgBlock) eraseMsg(seq uint64, ri, rl int) error { 4251 var le = binary.LittleEndian 4252 var hdr [msgHdrSize]byte 4253 4254 le.PutUint32(hdr[0:], uint32(rl)) 4255 le.PutUint64(hdr[4:], seq|ebit) 4256 le.PutUint64(hdr[12:], 0) 4257 le.PutUint16(hdr[20:], 0) 4258 4259 // Randomize record 4260 data := make([]byte, rl-emptyRecordLen) 4261 if n, err := rand.Read(data); err != nil { 4262 return err 4263 } else if n != len(data) { 4264 return fmt.Errorf("not enough overwrite bytes read (%d != %d)", n, len(data)) 4265 } 4266 4267 // Now write to underlying buffer. 4268 var b bytes.Buffer 4269 b.Write(hdr[:]) 4270 b.Write(data) 4271 4272 // Calculate hash. 4273 mb.hh.Reset() 4274 mb.hh.Write(hdr[4:20]) 4275 mb.hh.Write(data) 4276 checksum := mb.hh.Sum(nil) 4277 // Write to msg record. 4278 b.Write(checksum) 4279 4280 // Update both cache and disk. 4281 nbytes := b.Bytes() 4282 4283 // Cache 4284 if ri >= mb.cache.off { 4285 li := ri - mb.cache.off 4286 buf := mb.cache.buf[li : li+rl] 4287 copy(buf, nbytes) 4288 } 4289 4290 // Disk 4291 if mb.cache.off+mb.cache.wp > ri { 4292 <-dios 4293 mfd, err := os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms) 4294 dios <- struct{}{} 4295 if err != nil { 4296 return err 4297 } 4298 defer mfd.Close() 4299 if _, err = mfd.WriteAt(nbytes, int64(ri)); err == nil { 4300 mfd.Sync() 4301 } 4302 if err != nil { 4303 return err 4304 } 4305 } 4306 return nil 4307 } 4308 4309 // Truncate this message block to the storedMsg. 4310 func (mb *msgBlock) truncate(sm *StoreMsg) (nmsgs, nbytes uint64, err error) { 4311 mb.mu.Lock() 4312 defer mb.mu.Unlock() 4313 4314 // Make sure we are loaded to process messages etc. 4315 if err := mb.loadMsgsWithLock(); err != nil { 4316 return 0, 0, err 4317 } 4318 4319 // Calculate new eof using slot info from our new last sm. 4320 ri, rl, _, err := mb.slotInfo(int(sm.seq - mb.cache.fseq)) 4321 if err != nil { 4322 return 0, 0, err 4323 } 4324 // Calculate new eof. 
4325 eof := int64(ri + rl) 4326 4327 var purged, bytes uint64 4328 4329 checkDmap := mb.dmap.Size() > 0 4330 var smv StoreMsg 4331 4332 for seq := atomic.LoadUint64(&mb.last.seq); seq > sm.seq; seq-- { 4333 if checkDmap { 4334 if mb.dmap.Exists(seq) { 4335 // Delete and skip to next. 4336 mb.dmap.Delete(seq) 4337 checkDmap = !mb.dmap.IsEmpty() 4338 continue 4339 } 4340 } 4341 // We should have a valid msg to calculate removal stats. 4342 if m, err := mb.cacheLookup(seq, &smv); err == nil { 4343 if mb.msgs > 0 { 4344 rl := fileStoreMsgSize(m.subj, m.hdr, m.msg) 4345 mb.msgs-- 4346 if rl > mb.bytes { 4347 rl = mb.bytes 4348 } 4349 mb.bytes -= rl 4350 mb.rbytes -= rl 4351 // For return accounting. 4352 purged++ 4353 bytes += uint64(rl) 4354 } 4355 } 4356 } 4357 4358 // If the block is compressed then we have to load it into memory 4359 // and decompress it, truncate it and then write it back out. 4360 // Otherwise, truncate the file itself and close the descriptor. 4361 if mb.cmp != NoCompression { 4362 buf, err := mb.loadBlock(nil) 4363 if err != nil { 4364 return 0, 0, fmt.Errorf("failed to load block from disk: %w", err) 4365 } 4366 if mb.bek != nil && len(buf) > 0 { 4367 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 4368 if err != nil { 4369 return 0, 0, err 4370 } 4371 mb.bek = bek 4372 mb.bek.XORKeyStream(buf, buf) 4373 } 4374 buf, err = mb.decompressIfNeeded(buf) 4375 if err != nil { 4376 return 0, 0, fmt.Errorf("failed to decompress block: %w", err) 4377 } 4378 buf = buf[:eof] 4379 copy(mb.lchk[0:], buf[:len(buf)-checksumSize]) 4380 buf, err = mb.cmp.Compress(buf) 4381 if err != nil { 4382 return 0, 0, fmt.Errorf("failed to recompress block: %w", err) 4383 } 4384 meta := &CompressionInfo{ 4385 Algorithm: mb.cmp, 4386 OriginalSize: uint64(eof), 4387 } 4388 buf = append(meta.MarshalMetadata(), buf...) 
4389 if mb.bek != nil && len(buf) > 0 { 4390 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 4391 if err != nil { 4392 return 0, 0, err 4393 } 4394 mb.bek = bek 4395 mb.bek.XORKeyStream(buf, buf) 4396 } 4397 n, err := mb.writeAt(buf, 0) 4398 if err != nil { 4399 return 0, 0, fmt.Errorf("failed to rewrite compressed block: %w", err) 4400 } 4401 if n != len(buf) { 4402 return 0, 0, fmt.Errorf("short write (%d != %d)", n, len(buf)) 4403 } 4404 mb.mfd.Truncate(int64(len(buf))) 4405 mb.mfd.Sync() 4406 } else if mb.mfd != nil { 4407 mb.mfd.Truncate(eof) 4408 mb.mfd.Sync() 4409 // Update our checksum. 4410 var lchk [8]byte 4411 mb.mfd.ReadAt(lchk[:], eof-8) 4412 copy(mb.lchk[0:], lchk[:]) 4413 } else { 4414 return 0, 0, fmt.Errorf("failed to truncate msg block %d, file not open", mb.index) 4415 } 4416 4417 // Update our last msg. 4418 atomic.StoreUint64(&mb.last.seq, sm.seq) 4419 mb.last.ts = sm.ts 4420 4421 // Clear our cache. 4422 mb.clearCacheAndOffset() 4423 4424 // Redo per subject info for this block. 4425 mb.resetPerSubjectInfo() 4426 4427 // Load msgs again. 4428 mb.loadMsgsWithLock() 4429 4430 return purged, bytes, nil 4431 } 4432 4433 // Helper to determine if the mb is empty. 4434 func (mb *msgBlock) isEmpty() bool { 4435 return atomic.LoadUint64(&mb.first.seq) > atomic.LoadUint64(&mb.last.seq) 4436 } 4437 4438 // Lock should be held. 4439 func (mb *msgBlock) selectNextFirst() { 4440 var seq uint64 4441 fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 4442 for seq = fseq + 1; seq <= lseq; seq++ { 4443 if mb.dmap.Exists(seq) { 4444 // We will move past this so we can delete the entry. 4445 mb.dmap.Delete(seq) 4446 } else { 4447 break 4448 } 4449 } 4450 // Set new first sequence. 4451 atomic.StoreUint64(&mb.first.seq, seq) 4452 4453 // Check if we are empty.. 4454 if seq > lseq { 4455 mb.first.ts = 0 4456 return 4457 } 4458 4459 // Need to get the timestamp. 
4460 // We will try the cache direct and fallback if needed. 4461 var smv StoreMsg 4462 sm, _ := mb.cacheLookup(seq, &smv) 4463 if sm == nil { 4464 // Slow path, need to unlock. 4465 mb.mu.Unlock() 4466 sm, _, _ = mb.fetchMsg(seq, &smv) 4467 mb.mu.Lock() 4468 } 4469 if sm != nil { 4470 mb.first.ts = sm.ts 4471 } else { 4472 mb.first.ts = 0 4473 } 4474 } 4475 4476 // Select the next FirstSeq 4477 // Lock should be held. 4478 func (fs *fileStore) selectNextFirst() { 4479 if len(fs.blks) > 0 { 4480 mb := fs.blks[0] 4481 mb.mu.RLock() 4482 fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) 4483 fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() 4484 mb.mu.RUnlock() 4485 } else { 4486 // Could not find anything, so treat like purge 4487 fs.state.FirstSeq = fs.state.LastSeq + 1 4488 fs.state.FirstTime = time.Time{} 4489 } 4490 } 4491 4492 // Lock should be held. 4493 func (mb *msgBlock) resetCacheExpireTimer(td time.Duration) { 4494 if td == 0 { 4495 td = mb.cexp + 100*time.Millisecond 4496 } 4497 if mb.ctmr == nil { 4498 mb.ctmr = time.AfterFunc(td, mb.expireCache) 4499 } else { 4500 mb.ctmr.Reset(td) 4501 } 4502 } 4503 4504 // Lock should be held. 4505 func (mb *msgBlock) startCacheExpireTimer() { 4506 mb.resetCacheExpireTimer(0) 4507 } 4508 4509 // Used when we load in a message block. 4510 // Lock should be held. 4511 func (mb *msgBlock) clearCacheAndOffset() { 4512 // Reset linear scan tracker. 4513 mb.llseq = 0 4514 if mb.cache != nil { 4515 mb.cache.off = 0 4516 mb.cache.wp = 0 4517 } 4518 mb.clearCache() 4519 } 4520 4521 // Lock should be held. 4522 func (mb *msgBlock) clearCache() { 4523 if mb.ctmr != nil && mb.fss == nil { 4524 mb.ctmr.Stop() 4525 mb.ctmr = nil 4526 } 4527 4528 if mb.cache == nil { 4529 return 4530 } 4531 4532 buf := mb.cache.buf 4533 if mb.cache.off == 0 { 4534 mb.cache = nil 4535 } else { 4536 // Clear msgs and index. 
4537 mb.cache.buf = nil 4538 mb.cache.idx = nil 4539 mb.cache.wp = 0 4540 } 4541 recycleMsgBlockBuf(buf) 4542 } 4543 4544 // Called to possibly expire a message block cache. 4545 func (mb *msgBlock) expireCache() { 4546 mb.mu.Lock() 4547 defer mb.mu.Unlock() 4548 mb.expireCacheLocked() 4549 } 4550 4551 func (mb *msgBlock) tryForceExpireCache() { 4552 mb.mu.Lock() 4553 defer mb.mu.Unlock() 4554 mb.tryForceExpireCacheLocked() 4555 } 4556 4557 // We will attempt to force expire this by temporarily clearing the last load time. 4558 func (mb *msgBlock) tryForceExpireCacheLocked() { 4559 llts := mb.llts 4560 mb.llts = 0 4561 mb.expireCacheLocked() 4562 mb.llts = llts 4563 } 4564 4565 // This is for expiration of the write cache, which will be partial with fip. 4566 // So we want to bypass the Pools here. 4567 // Lock should be held. 4568 func (mb *msgBlock) tryExpireWriteCache() []byte { 4569 if mb.cache == nil { 4570 return nil 4571 } 4572 lwts, buf, llts, nra := mb.lwts, mb.cache.buf, mb.llts, mb.cache.nra 4573 mb.lwts, mb.cache.nra = 0, true 4574 mb.expireCacheLocked() 4575 mb.lwts = lwts 4576 if mb.cache != nil { 4577 mb.cache.nra = nra 4578 } 4579 // We could check for a certain time since last load, but to be safe just reuse if no loads at all. 4580 if llts == 0 && (mb.cache == nil || mb.cache.buf == nil) { 4581 // Clear last write time since we now are about to move on to a new lmb. 4582 mb.lwts = 0 4583 return buf[:0] 4584 } 4585 return nil 4586 } 4587 4588 // Lock should be held. 4589 func (mb *msgBlock) expireCacheLocked() { 4590 if mb.cache == nil { 4591 if mb.ctmr != nil { 4592 mb.ctmr.Stop() 4593 mb.ctmr = nil 4594 } 4595 return 4596 } 4597 4598 // Can't expire if we still have pending. 4599 if mb.cache != nil && len(mb.cache.buf)-int(mb.cache.wp) > 0 { 4600 mb.resetCacheExpireTimer(mb.cexp) 4601 return 4602 } 4603 4604 // Grab timestamp to compare. 
4605 tns := time.Now().UnixNano() 4606 4607 // For the core buffer of messages, we care about reads and writes, but not removes. 4608 bufts := mb.llts 4609 if mb.lwts > bufts { 4610 bufts = mb.lwts 4611 } 4612 4613 // Check for activity on the cache that would prevent us from expiring. 4614 if tns-bufts <= int64(mb.cexp) { 4615 mb.resetCacheExpireTimer(mb.cexp - time.Duration(tns-bufts)) 4616 return 4617 } 4618 4619 // If we are here we will at least expire the core msg buffer. 4620 // We need to capture offset in case we do a write next before a full load. 4621 if mb.cache != nil { 4622 mb.cache.off += len(mb.cache.buf) 4623 if !mb.cache.nra { 4624 recycleMsgBlockBuf(mb.cache.buf) 4625 } 4626 mb.cache.buf = nil 4627 mb.cache.wp = 0 4628 } 4629 4630 // Check if we can clear out our idx unless under force expire. 4631 // fss we keep longer and expire under sync timer checks. 4632 mb.clearCache() 4633 } 4634 4635 func (fs *fileStore) startAgeChk() { 4636 if fs.ageChk == nil && fs.cfg.MaxAge != 0 { 4637 fs.ageChk = time.AfterFunc(fs.cfg.MaxAge, fs.expireMsgs) 4638 } 4639 } 4640 4641 // Lock should be held. 4642 func (fs *fileStore) resetAgeChk(delta int64) { 4643 if fs.cfg.MaxAge == 0 { 4644 return 4645 } 4646 4647 fireIn := fs.cfg.MaxAge 4648 if delta > 0 && time.Duration(delta) < fireIn { 4649 if fireIn = time.Duration(delta); fireIn < 250*time.Millisecond { 4650 // Only fire at most once every 250ms. 4651 // Excessive firing can effect ingest performance. 4652 fireIn = time.Second 4653 } 4654 } 4655 if fs.ageChk != nil { 4656 fs.ageChk.Reset(fireIn) 4657 } else { 4658 fs.ageChk = time.AfterFunc(fireIn, fs.expireMsgs) 4659 } 4660 } 4661 4662 // Lock should be held. 4663 func (fs *fileStore) cancelAgeChk() { 4664 if fs.ageChk != nil { 4665 fs.ageChk.Stop() 4666 fs.ageChk = nil 4667 } 4668 } 4669 4670 // Will expire msgs that are too old. 4671 func (fs *fileStore) expireMsgs() { 4672 // We need to delete one by one here and can not optimize for the time being. 
4673 // Reason is that we need more information to adjust ack pending in consumers. 4674 var smv StoreMsg 4675 var sm *StoreMsg 4676 fs.mu.RLock() 4677 maxAge := int64(fs.cfg.MaxAge) 4678 minAge := time.Now().UnixNano() - maxAge 4679 fs.mu.RUnlock() 4680 4681 for sm, _ = fs.msgForSeq(0, &smv); sm != nil && sm.ts <= minAge; sm, _ = fs.msgForSeq(0, &smv) { 4682 fs.mu.Lock() 4683 fs.removeMsgViaLimits(sm.seq) 4684 fs.mu.Unlock() 4685 // Recalculate in case we are expiring a bunch. 4686 minAge = time.Now().UnixNano() - maxAge 4687 } 4688 4689 fs.mu.Lock() 4690 defer fs.mu.Unlock() 4691 4692 // Onky cancel if no message left, not on potential lookup error that would result in sm == nil. 4693 if fs.state.Msgs == 0 { 4694 fs.cancelAgeChk() 4695 } else { 4696 if sm == nil { 4697 fs.resetAgeChk(0) 4698 } else { 4699 fs.resetAgeChk(sm.ts - minAge) 4700 } 4701 } 4702 } 4703 4704 // Lock should be held. 4705 func (fs *fileStore) checkAndFlushAllBlocks() { 4706 for _, mb := range fs.blks { 4707 if mb.pendingWriteSize() > 0 { 4708 // Since fs lock is held need to pull this apart in case we need to rebuild state. 4709 mb.mu.Lock() 4710 ld, _ := mb.flushPendingMsgsLocked() 4711 mb.mu.Unlock() 4712 if ld != nil { 4713 fs.rebuildStateLocked(ld) 4714 } 4715 } 4716 } 4717 } 4718 4719 // This will check all the checksums on messages and report back any sequence numbers with errors. 4720 func (fs *fileStore) checkMsgs() *LostStreamData { 4721 fs.mu.Lock() 4722 defer fs.mu.Unlock() 4723 4724 fs.checkAndFlushAllBlocks() 4725 4726 // Clear any global subject state. 4727 fs.psim, fs.tsl = fs.psim.Empty(), 0 4728 4729 for _, mb := range fs.blks { 4730 // Make sure encryption loaded if needed for the block. 4731 fs.loadEncryptionForMsgBlock(mb) 4732 // FIXME(dlc) - check tombstones here too? 4733 if ld, _, err := mb.rebuildState(); err != nil && ld != nil { 4734 // Rebuild fs state too. 
4735 fs.rebuildStateLocked(ld) 4736 } 4737 fs.populateGlobalPerSubjectInfo(mb) 4738 } 4739 4740 return fs.ld 4741 } 4742 4743 // Lock should be held. 4744 func (mb *msgBlock) enableForWriting(fip bool) error { 4745 if mb == nil { 4746 return errNoMsgBlk 4747 } 4748 if mb.mfd != nil { 4749 return nil 4750 } 4751 <-dios 4752 mfd, err := os.OpenFile(mb.mfn, os.O_CREATE|os.O_RDWR, defaultFilePerms) 4753 dios <- struct{}{} 4754 if err != nil { 4755 return fmt.Errorf("error opening msg block file [%q]: %v", mb.mfn, err) 4756 } 4757 mb.mfd = mfd 4758 4759 // Spin up our flusher loop if needed. 4760 if !fip { 4761 mb.spinUpFlushLoop() 4762 } 4763 4764 return nil 4765 } 4766 4767 // Helper function to place a delete tombstone. 4768 func (mb *msgBlock) writeTombstone(seq uint64, ts int64) error { 4769 return mb.writeMsgRecord(emptyRecordLen, seq|tbit, _EMPTY_, nil, nil, ts, true) 4770 } 4771 4772 // Will write the message record to the underlying message block. 4773 // filestore lock will be held. 4774 func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush bool) error { 4775 mb.mu.Lock() 4776 defer mb.mu.Unlock() 4777 4778 // Enable for writing if our mfd is not open. 4779 if mb.mfd == nil { 4780 if err := mb.enableForWriting(flush); err != nil { 4781 return err 4782 } 4783 } 4784 4785 // Make sure we have a cache setup. 4786 if mb.cache == nil { 4787 mb.setupWriteCache(nil) 4788 } 4789 4790 // Check if we are tracking per subject for our simple state. 4791 // Do this before changing the cache that would trigger a flush pending msgs call 4792 // if we needed to regenerate the per subject info. 4793 // Note that tombstones have no subject so will not trigger here. 
4794 if len(subj) > 0 && !mb.noTrack { 4795 if err := mb.ensurePerSubjectInfoLoaded(); err != nil { 4796 return err 4797 } 4798 if ss := mb.fss[subj]; ss != nil { 4799 ss.Msgs++ 4800 ss.Last = seq 4801 } else { 4802 mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} 4803 } 4804 } 4805 4806 // Indexing 4807 index := len(mb.cache.buf) + int(mb.cache.off) 4808 4809 // Formats 4810 // Format with no header 4811 // total_len(4) sequence(8) timestamp(8) subj_len(2) subj msg hash(8) 4812 // With headers, high bit on total length will be set. 4813 // total_len(4) sequence(8) timestamp(8) subj_len(2) subj hdr_len(4) hdr msg hash(8) 4814 4815 // First write header, etc. 4816 var le = binary.LittleEndian 4817 var hdr [msgHdrSize]byte 4818 4819 l := uint32(rl) 4820 hasHeaders := len(mhdr) > 0 4821 if hasHeaders { 4822 l |= hbit 4823 } 4824 4825 le.PutUint32(hdr[0:], l) 4826 le.PutUint64(hdr[4:], seq) 4827 le.PutUint64(hdr[12:], uint64(ts)) 4828 le.PutUint16(hdr[20:], uint16(len(subj))) 4829 4830 // Now write to underlying buffer. 4831 mb.cache.buf = append(mb.cache.buf, hdr[:]...) 4832 mb.cache.buf = append(mb.cache.buf, subj...) 4833 4834 if hasHeaders { 4835 var hlen [4]byte 4836 le.PutUint32(hlen[0:], uint32(len(mhdr))) 4837 mb.cache.buf = append(mb.cache.buf, hlen[:]...) 4838 mb.cache.buf = append(mb.cache.buf, mhdr...) 4839 } 4840 mb.cache.buf = append(mb.cache.buf, msg...) 4841 4842 // Calculate hash. 4843 mb.hh.Reset() 4844 mb.hh.Write(hdr[4:20]) 4845 mb.hh.Write([]byte(subj)) 4846 if hasHeaders { 4847 mb.hh.Write(mhdr) 4848 } 4849 mb.hh.Write(msg) 4850 checksum := mb.hh.Sum(nil) 4851 // Grab last checksum 4852 copy(mb.lchk[0:], checksum) 4853 4854 // Update write through cache. 4855 // Write to msg record. 4856 mb.cache.buf = append(mb.cache.buf, checksum...) 4857 mb.cache.lrl = uint32(rl) 4858 4859 // Set cache timestamp for last store. 4860 mb.lwts = ts 4861 4862 // Only update index and do accounting if not a delete tombstone. 
4863 if seq&tbit == 0 { 4864 // Accounting, do this before stripping ebit, it is ebit aware. 4865 mb.updateAccounting(seq, ts, rl) 4866 // Strip ebit if set. 4867 seq = seq &^ ebit 4868 if mb.cache.fseq == 0 { 4869 mb.cache.fseq = seq 4870 } 4871 // Write index 4872 mb.cache.idx = append(mb.cache.idx, uint32(index)|hbit) 4873 } 4874 4875 fch, werr := mb.fch, mb.werr 4876 4877 // If we should be flushing, or had a write error, do so here. 4878 if flush || werr != nil { 4879 ld, err := mb.flushPendingMsgsLocked() 4880 if ld != nil && mb.fs != nil { 4881 // We have the mb lock here, this needs the mb locks so do in its own go routine. 4882 go mb.fs.rebuildState(ld) 4883 } 4884 if err != nil { 4885 return err 4886 } 4887 } else { 4888 // Kick the flusher here. 4889 kickFlusher(fch) 4890 } 4891 4892 return nil 4893 } 4894 4895 // How many bytes pending to be written for this message block. 4896 func (mb *msgBlock) pendingWriteSize() int { 4897 if mb == nil { 4898 return 0 4899 } 4900 mb.mu.RLock() 4901 defer mb.mu.RUnlock() 4902 return mb.pendingWriteSizeLocked() 4903 } 4904 4905 // How many bytes pending to be written for this message block. 4906 func (mb *msgBlock) pendingWriteSizeLocked() int { 4907 if mb == nil { 4908 return 0 4909 } 4910 var pending int 4911 if !mb.closed && mb.mfd != nil && mb.cache != nil { 4912 pending = len(mb.cache.buf) - int(mb.cache.wp) 4913 } 4914 return pending 4915 } 4916 4917 // Try to close our FDs if we can. 
4918 func (mb *msgBlock) closeFDs() error { 4919 mb.mu.Lock() 4920 defer mb.mu.Unlock() 4921 return mb.closeFDsLocked() 4922 } 4923 4924 func (mb *msgBlock) closeFDsLocked() error { 4925 if buf, _ := mb.bytesPending(); len(buf) > 0 { 4926 return errPendingData 4927 } 4928 mb.closeFDsLockedNoCheck() 4929 return nil 4930 } 4931 4932 func (mb *msgBlock) closeFDsLockedNoCheck() { 4933 if mb.mfd != nil { 4934 mb.mfd.Close() 4935 mb.mfd = nil 4936 } 4937 } 4938 4939 // bytesPending returns the buffer to be used for writing to the underlying file. 4940 // This marks we are in flush and will return nil if asked again until cleared. 4941 // Lock should be held. 4942 func (mb *msgBlock) bytesPending() ([]byte, error) { 4943 if mb == nil || mb.mfd == nil { 4944 return nil, errNoPending 4945 } 4946 if mb.cache == nil { 4947 return nil, errNoCache 4948 } 4949 if len(mb.cache.buf) <= mb.cache.wp { 4950 return nil, errNoPending 4951 } 4952 buf := mb.cache.buf[mb.cache.wp:] 4953 if len(buf) == 0 { 4954 return nil, errNoPending 4955 } 4956 return buf, nil 4957 } 4958 4959 // Returns the current blkSize including deleted msgs etc. 4960 func (mb *msgBlock) blkSize() uint64 { 4961 mb.mu.RLock() 4962 nb := mb.rbytes 4963 mb.mu.RUnlock() 4964 return nb 4965 } 4966 4967 // Update accounting on a write msg. 4968 // Lock should be held. 4969 func (mb *msgBlock) updateAccounting(seq uint64, ts int64, rl uint64) { 4970 isDeleted := seq&ebit != 0 4971 if isDeleted { 4972 seq = seq &^ ebit 4973 } 4974 4975 fseq := atomic.LoadUint64(&mb.first.seq) 4976 if (fseq == 0 || mb.first.ts == 0) && seq >= fseq { 4977 atomic.StoreUint64(&mb.first.seq, seq) 4978 mb.first.ts = ts 4979 } 4980 // Need atomics here for selectMsgBlock speed. 4981 atomic.StoreUint64(&mb.last.seq, seq) 4982 mb.last.ts = ts 4983 mb.rbytes += rl 4984 if !isDeleted { 4985 mb.bytes += rl 4986 mb.msgs++ 4987 } 4988 } 4989 4990 // Lock should be held. 
4991 func (fs *fileStore) writeMsgRecord(seq uint64, ts int64, subj string, hdr, msg []byte) (uint64, error) { 4992 var err error 4993 4994 // Get size for this message. 4995 rl := fileStoreMsgSize(subj, hdr, msg) 4996 if rl&hbit != 0 { 4997 return 0, ErrMsgTooLarge 4998 } 4999 // Grab our current last message block. 5000 mb := fs.lmb 5001 5002 // Mark as dirty for stream state. 5003 fs.dirty++ 5004 5005 if mb == nil || mb.msgs > 0 && mb.blkSize()+rl > fs.fcfg.BlockSize { 5006 if mb != nil && fs.fcfg.Compression != NoCompression { 5007 // We've now reached the end of this message block, if we want 5008 // to compress blocks then now's the time to do it. 5009 go mb.recompressOnDiskIfNeeded() 5010 } 5011 if mb, err = fs.newMsgBlockForWrite(); err != nil { 5012 return 0, err 5013 } 5014 } 5015 5016 // Ask msg block to store in write through cache. 5017 err = mb.writeMsgRecord(rl, seq, subj, hdr, msg, ts, fs.fip) 5018 5019 return rl, err 5020 } 5021 5022 func (mb *msgBlock) recompressOnDiskIfNeeded() error { 5023 alg := mb.fs.fcfg.Compression 5024 mb.mu.Lock() 5025 defer mb.mu.Unlock() 5026 5027 origFN := mb.mfn // The original message block on disk. 5028 tmpFN := mb.mfn + compressTmpSuffix // The compressed block will be written here. 5029 5030 // Open up the file block and read in the entire contents into memory. 5031 // One of two things will happen: 5032 // 1. The block will be compressed already and have a valid metadata 5033 // header, in which case we do nothing. 5034 // 2. The block will be uncompressed, in which case we will compress it 5035 // and then write it back out to disk, reencrypting if necessary. 5036 <-dios 5037 origBuf, err := os.ReadFile(origFN) 5038 dios <- struct{}{} 5039 5040 if err != nil { 5041 return fmt.Errorf("failed to read original block from disk: %w", err) 5042 } 5043 5044 // If the block is encrypted then we will need to decrypt it before 5045 // doing anything. 
We always encrypt after compressing because then the 5046 // compression can be as efficient as possible on the raw data, whereas 5047 // the encrypted ciphertext will not compress anywhere near as well. 5048 // The block encryption also covers the optional compression metadata. 5049 if mb.bek != nil && len(origBuf) > 0 { 5050 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 5051 if err != nil { 5052 return err 5053 } 5054 mb.bek = bek 5055 mb.bek.XORKeyStream(origBuf, origBuf) 5056 } 5057 5058 meta := &CompressionInfo{} 5059 if _, err := meta.UnmarshalMetadata(origBuf); err != nil { 5060 // An error is only returned here if there's a problem with parsing 5061 // the metadata. If the file has no metadata at all, no error is 5062 // returned and the algorithm defaults to no compression. 5063 return fmt.Errorf("failed to read existing metadata header: %w", err) 5064 } 5065 if meta.Algorithm == alg { 5066 // The block is already compressed with the chosen algorithm so there 5067 // is nothing else to do. This is not a common case, it is here only 5068 // to ensure we don't do unnecessary work in case something asked us 5069 // to recompress an already compressed block with the same algorithm. 5070 return nil 5071 } else if alg != NoCompression { 5072 // The block is already compressed using some algorithm, so we need 5073 // to decompress the block using the existing algorithm before we can 5074 // recompress it with the new one. 5075 if origBuf, err = meta.Algorithm.Decompress(origBuf); err != nil { 5076 return fmt.Errorf("failed to decompress original block: %w", err) 5077 } 5078 } 5079 5080 // Rather than modifying the existing block on disk (which is a dangerous 5081 // operation if something goes wrong), create a new temporary file. We will 5082 // write out the new block here and then swap the files around afterwards 5083 // once everything else has succeeded correctly. 
5084 <-dios 5085 tmpFD, err := os.OpenFile(tmpFN, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, defaultFilePerms) 5086 dios <- struct{}{} 5087 if err != nil { 5088 return fmt.Errorf("failed to create temporary file: %w", err) 5089 } 5090 5091 // The original buffer at this point is uncompressed, so we will now compress 5092 // it if needed. Note that if the selected algorithm is NoCompression, the 5093 // Compress function will just return the input buffer unmodified. 5094 cmpBuf, err := alg.Compress(origBuf) 5095 if err != nil { 5096 return fmt.Errorf("failed to compress block: %w", err) 5097 } 5098 5099 // We only need to write out the metadata header if compression is enabled. 5100 // If we're trying to uncompress the file on disk at this point, don't bother 5101 // writing metadata. 5102 if alg != NoCompression { 5103 meta := &CompressionInfo{ 5104 Algorithm: alg, 5105 OriginalSize: uint64(len(origBuf)), 5106 } 5107 cmpBuf = append(meta.MarshalMetadata(), cmpBuf...) 5108 } 5109 5110 // Re-encrypt the block if necessary. 5111 if mb.bek != nil && len(cmpBuf) > 0 { 5112 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 5113 if err != nil { 5114 return err 5115 } 5116 mb.bek = bek 5117 mb.bek.XORKeyStream(cmpBuf, cmpBuf) 5118 } 5119 5120 // Write the new block data (which might be compressed or encrypted) to the 5121 // temporary file. 
5122 errorCleanup := func(err error) error { 5123 tmpFD.Close() 5124 os.Remove(tmpFN) 5125 return err 5126 } 5127 if n, err := tmpFD.Write(cmpBuf); err != nil { 5128 return errorCleanup(fmt.Errorf("failed to write to temporary file: %w", err)) 5129 } else if n != len(cmpBuf) { 5130 return errorCleanup(fmt.Errorf("short write to temporary file (%d != %d)", n, len(cmpBuf))) 5131 } 5132 if err := tmpFD.Sync(); err != nil { 5133 return errorCleanup(fmt.Errorf("failed to sync temporary file: %w", err)) 5134 } 5135 if err := tmpFD.Close(); err != nil { 5136 return errorCleanup(fmt.Errorf("failed to close temporary file: %w", err)) 5137 } 5138 5139 // Now replace the original file with the newly updated temp file. 5140 if err := os.Rename(tmpFN, origFN); err != nil { 5141 return fmt.Errorf("failed to move temporary file into place: %w", err) 5142 } 5143 5144 // Since the message block might be retained in memory, make sure the 5145 // compression algorithm is up-to-date, since this will be needed when 5146 // compacting or truncating. 5147 mb.cmp = alg 5148 return nil 5149 } 5150 5151 func (mb *msgBlock) decompressIfNeeded(buf []byte) ([]byte, error) { 5152 var meta CompressionInfo 5153 if n, err := meta.UnmarshalMetadata(buf); err != nil { 5154 // There was a problem parsing the metadata header of the block. 5155 // If there's no metadata header, an error isn't returned here, 5156 // we will instead just use default values of no compression. 5157 return nil, err 5158 } else if n == 0 { 5159 // There were no metadata bytes, so we assume the block is not 5160 // compressed and return it as-is. 5161 return buf, nil 5162 } else { 5163 // Metadata was present so it's quite likely the block contents 5164 // are compressed. If by any chance the metadata claims that the 5165 // block is uncompressed, then the input slice is just returned 5166 // unmodified. 5167 return meta.Algorithm.Decompress(buf[n:]) 5168 } 5169 } 5170 5171 // Lock should be held. 
5172 func (mb *msgBlock) ensureRawBytesLoaded() error { 5173 if mb.rbytes > 0 { 5174 return nil 5175 } 5176 f, err := mb.openBlock() 5177 if err != nil { 5178 return err 5179 } 5180 defer f.Close() 5181 if fi, err := f.Stat(); fi != nil && err == nil { 5182 mb.rbytes = uint64(fi.Size()) 5183 } else { 5184 return err 5185 } 5186 return nil 5187 } 5188 5189 // Sync msg and index files as needed. This is called from a timer. 5190 func (fs *fileStore) syncBlocks() { 5191 fs.mu.RLock() 5192 if fs.closed { 5193 fs.mu.RUnlock() 5194 return 5195 } 5196 blks := append([]*msgBlock(nil), fs.blks...) 5197 lmb := fs.lmb 5198 syncInterval := fs.fcfg.SyncInterval 5199 fs.mu.RUnlock() 5200 5201 var markDirty bool 5202 for _, mb := range blks { 5203 // Do actual sync. Hold lock for consistency. 5204 mb.mu.Lock() 5205 if mb.closed { 5206 mb.mu.Unlock() 5207 continue 5208 } 5209 // See if we can close FDs due to being idle. 5210 if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle { 5211 mb.dirtyCloseWithRemove(false) 5212 } 5213 // Check our fss subject metadata. 5214 // If we have no activity within sync interval remove. 5215 if mb.fssLoaded() && mb.sinceLastActivity() > syncInterval { 5216 mb.fss = nil 5217 } 5218 5219 // Check if we should compact here as well. 5220 // Do not compact last mb. 5221 var needsCompact bool 5222 if mb != lmb && mb.ensureRawBytesLoaded() == nil && mb.rbytes > mb.bytes { 5223 needsCompact = true 5224 markDirty = true 5225 } 5226 5227 // Check if we need to sync. We will not hold lock during actual sync. 5228 needSync := mb.needSync 5229 if needSync { 5230 // Flush anything that may be pending. 5231 mb.flushPendingMsgsLocked() 5232 } 5233 mb.mu.Unlock() 5234 5235 // Check if we should compact here. 5236 // Need to hold fs lock in case we reference psim when loading in the mb. 
5237 if needsCompact { 5238 fs.mu.RLock() 5239 mb.mu.Lock() 5240 mb.compact() 5241 mb.mu.Unlock() 5242 fs.mu.RUnlock() 5243 } 5244 5245 // Check if we need to sync this block. 5246 if needSync { 5247 mb.mu.Lock() 5248 var fd *os.File 5249 var didOpen bool 5250 if mb.mfd != nil { 5251 fd = mb.mfd 5252 } else { 5253 <-dios 5254 fd, _ = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms) 5255 dios <- struct{}{} 5256 didOpen = true 5257 } 5258 // If we have an fd. 5259 if fd != nil { 5260 canClear := fd.Sync() == nil 5261 // If we opened the file close the fd. 5262 if didOpen { 5263 fd.Close() 5264 } 5265 // Only clear sync flag on success. 5266 if canClear { 5267 mb.needSync = false 5268 } 5269 } 5270 mb.mu.Unlock() 5271 } 5272 } 5273 5274 fs.mu.Lock() 5275 if fs.closed { 5276 fs.mu.Unlock() 5277 return 5278 } 5279 fs.setSyncTimer() 5280 if markDirty { 5281 fs.dirty++ 5282 } 5283 5284 // Sync state file if we are not running with sync always. 5285 if !fs.fcfg.SyncAlways { 5286 fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) 5287 <-dios 5288 fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms) 5289 dios <- struct{}{} 5290 if fd != nil { 5291 fd.Sync() 5292 fd.Close() 5293 } 5294 } 5295 fs.mu.Unlock() 5296 } 5297 5298 // Select the message block where this message should be found. 5299 // Return nil if not in the set. 5300 // Read lock should be held. 5301 func (fs *fileStore) selectMsgBlock(seq uint64) *msgBlock { 5302 _, mb := fs.selectMsgBlockWithIndex(seq) 5303 return mb 5304 } 5305 5306 // Lock should be held. 5307 func (fs *fileStore) selectMsgBlockWithIndex(seq uint64) (int, *msgBlock) { 5308 // Check for out of range. 
5309 if seq < fs.state.FirstSeq || seq > fs.state.LastSeq || fs.state.Msgs == 0 { 5310 return -1, nil 5311 } 5312 5313 const linearThresh = 32 5314 nb := len(fs.blks) - 1 5315 5316 if nb < linearThresh { 5317 for i, mb := range fs.blks { 5318 if seq <= atomic.LoadUint64(&mb.last.seq) { 5319 return i, mb 5320 } 5321 } 5322 return -1, nil 5323 } 5324 5325 // Do traditional binary search here since we know the blocks are sorted by sequence first and last. 5326 for low, high, mid := 0, nb, nb/2; low <= high; mid = (low + high) / 2 { 5327 mb := fs.blks[mid] 5328 // Right now these atomic loads do not factor in, so fine to leave. Was considering 5329 // uplifting these to fs scope to avoid atomic load but not needed. 5330 first, last := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 5331 if seq > last { 5332 low = mid + 1 5333 } else if seq < first { 5334 // A message block's first sequence can change here meaning we could find a gap. 5335 // We want to behave like above, which if inclusive (we check at start) should 5336 // always return an index and a valid mb. 5337 // If we have a gap then our seq would be > fs.blks[mid-1].last.seq 5338 if mid == 0 || seq > atomic.LoadUint64(&fs.blks[mid-1].last.seq) { 5339 return mid, mb 5340 } 5341 high = mid - 1 5342 } else { 5343 return mid, mb 5344 } 5345 } 5346 5347 return -1, nil 5348 } 5349 5350 // Select the message block where this message should be found. 5351 // Return nil if not in the set. 5352 func (fs *fileStore) selectMsgBlockForStart(minTime time.Time) *msgBlock { 5353 fs.mu.RLock() 5354 defer fs.mu.RUnlock() 5355 5356 t := minTime.UnixNano() 5357 for _, mb := range fs.blks { 5358 mb.mu.RLock() 5359 found := t <= mb.last.ts 5360 mb.mu.RUnlock() 5361 if found { 5362 return mb 5363 } 5364 } 5365 return nil 5366 } 5367 5368 // Index a raw msg buffer. 5369 // Lock should be held. 
5370 func (mb *msgBlock) indexCacheBuf(buf []byte) error { 5371 var le = binary.LittleEndian 5372 5373 var fseq uint64 5374 var idx []uint32 5375 var index uint32 5376 5377 mbFirstSeq := atomic.LoadUint64(&mb.first.seq) 5378 mbLastSeq := atomic.LoadUint64(&mb.last.seq) 5379 5380 // Sanity check here since we calculate size to allocate based on this. 5381 if mbFirstSeq > (mbLastSeq + 1) { // Purged state first == last + 1 5382 mb.fs.warn("indexCacheBuf corrupt state: mb.first %d mb.last %d", mbFirstSeq, mbLastSeq) 5383 // This would cause idxSz to wrap. 5384 return errCorruptState 5385 } 5386 5387 // Capture beginning size of dmap. 5388 dms := uint64(mb.dmap.Size()) 5389 idxSz := mbLastSeq - mbFirstSeq + 1 5390 5391 if mb.cache == nil { 5392 // Approximation, may adjust below. 5393 fseq = mbFirstSeq 5394 idx = make([]uint32, 0, idxSz) 5395 mb.cache = &cache{} 5396 } else { 5397 fseq = mb.cache.fseq 5398 idx = mb.cache.idx 5399 if len(idx) == 0 { 5400 idx = make([]uint32, 0, idxSz) 5401 } 5402 index = uint32(len(mb.cache.buf)) 5403 buf = append(mb.cache.buf, buf...) 5404 } 5405 5406 // Create FSS if we should track. 5407 var popFss bool 5408 if mb.fssNotLoaded() { 5409 mb.fss = make(map[string]*SimpleState) 5410 popFss = true 5411 } 5412 5413 lbuf := uint32(len(buf)) 5414 var seq uint64 5415 for index < lbuf { 5416 if index+msgHdrSize > lbuf { 5417 return errCorruptState 5418 } 5419 hdr := buf[index : index+msgHdrSize] 5420 rl, slen := le.Uint32(hdr[0:]), int(le.Uint16(hdr[20:])) 5421 seq = le.Uint64(hdr[4:]) 5422 5423 // Clear any headers bit that could be set. 5424 rl &^= hbit 5425 dlen := int(rl) - msgHdrSize 5426 5427 // Do some quick sanity checks here. 5428 if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { 5429 mb.fs.warn("indexCacheBuf corrupt record state: dlen %d slen %d index %d rl %d lbuf %d", dlen, slen, index, rl, lbuf) 5430 // This means something is off. 5431 // TODO(dlc) - Add into bad list? 
5432 return errCorruptState 5433 } 5434 5435 // Check for tombstones which we can skip in terms of indexing. 5436 if seq&tbit != 0 { 5437 index += rl 5438 continue 5439 } 5440 5441 // Clear any erase bits. 5442 erased := seq&ebit != 0 5443 seq = seq &^ ebit 5444 5445 // We defer checksum checks to individual msg cache lookups to amortorize costs and 5446 // not introduce latency for first message from a newly loaded block. 5447 if seq >= mbFirstSeq { 5448 // Track that we do not have holes. 5449 if slot := int(seq - mbFirstSeq); slot != len(idx) { 5450 // If we have a hole fill it. 5451 for dseq := mbFirstSeq + uint64(len(idx)); dseq < seq; dseq++ { 5452 idx = append(idx, dbit) 5453 if dms == 0 { 5454 mb.dmap.Insert(dseq) 5455 } 5456 } 5457 } 5458 // Add to our index. 5459 idx = append(idx, index) 5460 mb.cache.lrl = uint32(rl) 5461 // Adjust if we guessed wrong. 5462 if seq != 0 && seq < fseq { 5463 fseq = seq 5464 } 5465 5466 // Make sure our dmap has this entry if it was erased. 5467 if erased && dms == 0 { 5468 mb.dmap.Insert(seq) 5469 } 5470 5471 // Handle FSS inline here. 5472 if popFss && slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) { 5473 bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)] 5474 if ss := mb.fss[string(bsubj)]; ss != nil { 5475 ss.Msgs++ 5476 ss.Last = seq 5477 } else { 5478 mb.fss[string(bsubj)] = &SimpleState{ 5479 Msgs: 1, 5480 First: seq, 5481 Last: seq, 5482 } 5483 } 5484 } 5485 } 5486 index += rl 5487 } 5488 5489 // Track holes at the end of the block, these would be missed in the 5490 // earlier loop if we've ran out of block file to look at, but should 5491 // be easily noticed because the seq will be below the last seq from 5492 // the index. 
if seq > 0 && seq < mbLastSeq {
		for dseq := seq; dseq < mbLastSeq; dseq++ {
			idx = append(idx, dbit)
			if dms == 0 {
				mb.dmap.Insert(dseq)
			}
		}
	}

	mb.cache.buf = buf
	mb.cache.idx = idx
	mb.cache.fseq = fseq
	mb.cache.wp += int(lbuf)

	return nil
}

// flushPendingMsgs writes out any messages for this message block.
// It takes the block lock for the duration of the flush. If the flush reports
// lost data, the filestore state is rebuilt after the block lock is released.
func (mb *msgBlock) flushPendingMsgs() error {
	mb.mu.Lock()
	fsLostData, err := mb.flushPendingMsgsLocked()
	// Snapshot fs while we still hold the lock.
	fs := mb.fs
	mb.mu.Unlock()

	// Signals us that we need to rebuild filestore state.
	if fsLostData != nil && fs != nil {
		// Rebuild fs state too.
		fs.rebuildState(fsLostData)
	}
	return err
}

// Write function for actual data.
// Writes buf to mb.mfd at offset woff and returns the number of bytes written.
// The write itself is gated by the dios channel: a token is taken before the
// write and returned right after it.
// mb.mfd should not be nil.
// Lock should be held.
func (mb *msgBlock) writeAt(buf []byte, woff int64) (int, error) {
	// Used to mock write failures.
	if mb.mockWriteErr {
		// Reset on trip.
		mb.mockWriteErr = false
		return 0, errors.New("mock write error")
	}
	<-dios
	n, err := mb.mfd.WriteAt(buf, woff)
	dios <- struct{}{}
	return n, err
}

// flushPendingMsgsLocked writes out any messages for this message block.
// It returns (nil, nil) when there is no cache or no open file, and treats
// errNoPending and errNoCache from bytesPending as "nothing to do".
// On a write error the block is dirty closed, its state is rebuilt, the error
// is recorded in mb.werr, and any lost data from the rebuild is returned along
// with the error so the caller can rebuild the filestore state.
// Lock should be held.
func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) {
	// Signals us that we need to rebuild filestore state.
	// Note this is never assigned in this function, so the returns that use it
	// below always yield nil lost data.
	var fsLostData *LostStreamData

	if mb.cache == nil || mb.mfd == nil {
		return nil, nil
	}

	buf, err := mb.bytesPending()
	// If we got an error back return here.
	if err != nil {
		// No pending data to be written is not an error.
		if err == errNoPending || err == errNoCache {
			err = nil
		}
		return nil, err
	}

	// File offset for this write and the number of bytes we intend to flush.
	woff := int64(mb.cache.off + mb.cache.wp)
	lob := len(buf)

	// TODO(dlc) - Normally we would not hold the lock across I/O so we can improve performance.
	// We will hold to stabilize the code base, as we have had a few anomalies with partial cache errors
	// under heavy load.

	// Check if we need to encrypt.
	if mb.bek != nil && lob > 0 {
		// Need to leave original alone.
		// The encrypted bytes go into a separate buffer so the cached bytes are untouched.
		var dst []byte
		if lob <= defaultLargeBlockSize {
			dst = getMsgBlockBuf(lob)[:lob]
		} else {
			dst = make([]byte, lob)
		}
		mb.bek.XORKeyStream(dst, buf)
		buf = dst
	}

	// Append new data to the message block file.
	// The loop handles partial writes: after a short write we advance the
	// offset, trim what was written and try again with the remainder.
	for lbb := lob; lbb > 0; lbb = len(buf) {
		n, err := mb.writeAt(buf, woff)
		if err != nil {
			mb.dirtyCloseWithRemove(false)
			ld, _, _ := mb.rebuildStateLocked()
			mb.werr = err
			return ld, err
		}
		// Update our write offset.
		woff += int64(n)
		// Partial write.
		if n != lbb {
			buf = buf[n:]
		} else {
			// Done.
			break
		}
	}

	// Clear any error.
	mb.werr = nil

	// Cache may be gone.
	if mb.cache == nil || mb.mfd == nil {
		return fsLostData, mb.werr
	}

	// Check if we are in sync always mode.
	// Otherwise just flag that a sync is needed.
	if mb.syncAlways {
		mb.mfd.Sync()
	} else {
		mb.needSync = true
	}

	// Check for additional writes while we were writing to the disk.
	moreBytes := len(mb.cache.buf) - mb.cache.wp - lob

	// Decide what we want to do with the buffer in hand. If we have load interest
	// we will hold onto the whole thing, otherwise empty the buffer, possibly reusing it.
	// Load interest here means the last load timestamp (llts) is within mb.cexp of now.
	if ts := time.Now().UnixNano(); ts < mb.llts || (ts-mb.llts) <= int64(mb.cexp) {
		mb.cache.wp += lob
	} else {
		if cap(mb.cache.buf) <= maxBufReuse {
			buf = mb.cache.buf[:0]
		} else {
			recycleMsgBlockBuf(mb.cache.buf)
			buf = nil
		}
		if moreBytes > 0 {
			// Carry over the bytes that were not part of this flush.
			nbuf := mb.cache.buf[len(mb.cache.buf)-moreBytes:]
			if moreBytes > (len(mb.cache.buf)/4*3) && cap(nbuf) <= maxBufReuse {
				buf = nbuf
			} else {
				buf = append(buf, nbuf...)
			}
		}
		// Update our cache offset.
		mb.cache.off = int(woff)
		// Reset write pointer.
		mb.cache.wp = 0
		// Place buffer back in the cache structure.
		mb.cache.buf = buf
		// Mark fseq to 0
		mb.cache.fseq = 0
	}

	return fsLostData, mb.werr
}

// clearLoading resets the loading flag for this block.
// Lock should be held.
func (mb *msgBlock) clearLoading() {
	mb.loading = false
}

// Will load msgs from disk.
func (mb *msgBlock) loadMsgs() error {
	// We hold the lock here the whole time by design.
	mb.mu.Lock()
	defer mb.mu.Unlock()
	return mb.loadMsgsWithLock()
}

// cacheAlreadyLoaded reports whether the cache holds the whole block.
// That requires a cache that starts at offset 0, has a first sequence and a
// non-empty buffer, and whose index length equals
// msgs + dmap size + (first.seq - cache.fseq).
// Lock should be held.
func (mb *msgBlock) cacheAlreadyLoaded() bool {
	if mb.cache == nil || mb.cache.off != 0 || mb.cache.fseq == 0 || len(mb.cache.buf) == 0 {
		return false
	}
	numEntries := mb.msgs + uint64(mb.dmap.Size()) + (atomic.LoadUint64(&mb.first.seq) - mb.cache.fseq)
	return numEntries == uint64(len(mb.cache.idx))
}

// cacheNotLoaded is the negation of cacheAlreadyLoaded.
// Lock should be held.
func (mb *msgBlock) cacheNotLoaded() bool {
	return !mb.cacheAlreadyLoaded()
}

// Report if our fss is not loaded.
// Blocks flagged noTrack never report their fss as missing.
// Lock should be held.
func (mb *msgBlock) fssNotLoaded() bool {
	return mb.fss == nil && !mb.noTrack
}

// Report if we have our fss loaded.
// Lock should be held.
func (mb *msgBlock) fssLoaded() bool {
	return mb.fss != nil
}

// Wrap openBlock for the gated semaphore processing.
5691 // Lock should be held 5692 func (mb *msgBlock) openBlock() (*os.File, error) { 5693 // Gate with concurrent IO semaphore. 5694 <-dios 5695 f, err := os.Open(mb.mfn) 5696 dios <- struct{}{} 5697 return f, err 5698 } 5699 5700 // Used to load in the block contents. 5701 // Lock should be held and all conditionals satisfied prior. 5702 func (mb *msgBlock) loadBlock(buf []byte) ([]byte, error) { 5703 var f *os.File 5704 // Re-use if we have mfd open. 5705 if mb.mfd != nil { 5706 f = mb.mfd 5707 if n, err := f.Seek(0, 0); n != 0 || err != nil { 5708 f = nil 5709 mb.closeFDsLockedNoCheck() 5710 } 5711 } 5712 if f == nil { 5713 var err error 5714 f, err = mb.openBlock() 5715 if err != nil { 5716 if os.IsNotExist(err) { 5717 err = errNoBlkData 5718 } 5719 return nil, err 5720 } 5721 defer f.Close() 5722 } 5723 5724 var sz int 5725 if info, err := f.Stat(); err == nil { 5726 sz64 := info.Size() 5727 if int64(int(sz64)) == sz64 { 5728 sz = int(sz64) 5729 } else { 5730 return nil, errMsgBlkTooBig 5731 } 5732 } 5733 5734 if buf == nil { 5735 buf = getMsgBlockBuf(sz) 5736 if sz > cap(buf) { 5737 // We know we will make a new one so just recycle for now. 5738 recycleMsgBlockBuf(buf) 5739 buf = nil 5740 } 5741 } 5742 5743 if sz > cap(buf) { 5744 buf = make([]byte, sz) 5745 } else { 5746 buf = buf[:sz] 5747 } 5748 5749 <-dios 5750 n, err := io.ReadFull(f, buf) 5751 dios <- struct{}{} 5752 // On success capture raw bytes size. 5753 if err == nil { 5754 mb.rbytes = uint64(n) 5755 } 5756 return buf[:n], err 5757 } 5758 5759 // Lock should be held. 5760 func (mb *msgBlock) loadMsgsWithLock() error { 5761 // Check for encryption, we do not load keys on startup anymore so might need to load them here. 5762 if mb.fs != nil && mb.fs.prf != nil && (mb.aek == nil || mb.bek == nil) { 5763 if err := mb.fs.loadEncryptionForMsgBlock(mb); err != nil { 5764 return err 5765 } 5766 } 5767 5768 // Check to see if we are loading already. 
5769 if mb.loading { 5770 return nil 5771 } 5772 5773 // Set loading status. 5774 mb.loading = true 5775 defer mb.clearLoading() 5776 5777 var nchecks int 5778 5779 checkCache: 5780 nchecks++ 5781 if nchecks > 8 { 5782 return errCorruptState 5783 } 5784 5785 // Check to see if we have a full cache. 5786 if mb.cacheAlreadyLoaded() { 5787 return nil 5788 } 5789 5790 mb.llts = time.Now().UnixNano() 5791 5792 // FIXME(dlc) - We could be smarter here. 5793 if buf, _ := mb.bytesPending(); len(buf) > 0 { 5794 ld, err := mb.flushPendingMsgsLocked() 5795 if ld != nil && mb.fs != nil { 5796 // We do not know if fs is locked or not at this point. 5797 // This should be an exceptional condition so do so in Go routine. 5798 go mb.fs.rebuildState(ld) 5799 } 5800 if err != nil { 5801 return err 5802 } 5803 goto checkCache 5804 } 5805 5806 // Load in the whole block. 5807 // We want to hold the mb lock here to avoid any changes to state. 5808 buf, err := mb.loadBlock(nil) 5809 if err != nil { 5810 mb.fs.warn("loadBlock error: ", err) 5811 if err == errNoBlkData { 5812 if ld, _, err := mb.rebuildStateLocked(); err != nil && ld != nil { 5813 // Rebuild fs state too. 5814 go mb.fs.rebuildState(ld) 5815 } 5816 } 5817 return err 5818 } 5819 5820 // Reset the cache since we just read everything in. 5821 // Make sure this is cleared in case we had a partial when we started. 5822 mb.clearCacheAndOffset() 5823 5824 // Check if we need to decrypt. 5825 if mb.bek != nil && len(buf) > 0 { 5826 bek, err := genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce) 5827 if err != nil { 5828 return err 5829 } 5830 mb.bek = bek 5831 mb.bek.XORKeyStream(buf, buf) 5832 } 5833 5834 // Check for compression. 
5835 if buf, err = mb.decompressIfNeeded(buf); err != nil { 5836 return err 5837 } 5838 5839 if err := mb.indexCacheBuf(buf); err != nil { 5840 if err == errCorruptState { 5841 var ld *LostStreamData 5842 if ld, _, err = mb.rebuildStateLocked(); ld != nil { 5843 // We do not know if fs is locked or not at this point. 5844 // This should be an exceptional condition so do so in Go routine. 5845 go mb.fs.rebuildState(ld) 5846 } 5847 } 5848 if err != nil { 5849 return err 5850 } 5851 goto checkCache 5852 } 5853 5854 if len(buf) > 0 { 5855 mb.cloads++ 5856 mb.startCacheExpireTimer() 5857 } 5858 5859 return nil 5860 } 5861 5862 // Fetch a message from this block, possibly reading in and caching the messages. 5863 // We assume the block was selected and is correct, so we do not do range checks. 5864 func (mb *msgBlock) fetchMsg(seq uint64, sm *StoreMsg) (*StoreMsg, bool, error) { 5865 mb.mu.Lock() 5866 defer mb.mu.Unlock() 5867 5868 fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 5869 if seq < fseq || seq > lseq { 5870 return nil, false, ErrStoreMsgNotFound 5871 } 5872 5873 // See if we can short circuit if we already know msg deleted. 5874 if mb.dmap.Exists(seq) { 5875 // Update for scanning like cacheLookup would have. 
5876 llseq := mb.llseq 5877 if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 || seq == mb.llseq-1 { 5878 mb.llseq = seq 5879 } 5880 expireOk := (seq == lseq && llseq == seq-1) || (seq == fseq && llseq == seq+1) 5881 return nil, expireOk, errDeletedMsg 5882 } 5883 5884 if mb.cacheNotLoaded() { 5885 if err := mb.loadMsgsWithLock(); err != nil { 5886 return nil, false, err 5887 } 5888 } 5889 llseq := mb.llseq 5890 5891 fsm, err := mb.cacheLookup(seq, sm) 5892 if err != nil { 5893 return nil, false, err 5894 } 5895 expireOk := (seq == lseq && llseq == seq-1) || (seq == fseq && llseq == seq+1) 5896 return fsm, expireOk, err 5897 } 5898 5899 var ( 5900 errNoCache = errors.New("no message cache") 5901 errBadMsg = errors.New("malformed or corrupt message") 5902 errDeletedMsg = errors.New("deleted message") 5903 errPartialCache = errors.New("partial cache") 5904 errNoPending = errors.New("message block does not have pending data") 5905 errNotReadable = errors.New("storage directory not readable") 5906 errCorruptState = errors.New("corrupt state file") 5907 errPriorState = errors.New("prior state file") 5908 errPendingData = errors.New("pending data still present") 5909 errNoEncryption = errors.New("encryption not enabled") 5910 errBadKeySize = errors.New("encryption bad key size") 5911 errNoMsgBlk = errors.New("no message block") 5912 errMsgBlkTooBig = errors.New("message block size exceeded int capacity") 5913 errUnknownCipher = errors.New("unknown cipher") 5914 errNoMainKey = errors.New("encrypted store encountered with no main key") 5915 errNoBlkData = errors.New("message block data missing") 5916 errStateTooBig = errors.New("store state too big for optional write") 5917 ) 5918 5919 const ( 5920 // Used for marking messages that have had their checksums checked. 5921 // Used to signal a message record with headers. 5922 hbit = 1 << 31 5923 // Used for marking erased messages sequences. 5924 ebit = 1 << 63 5925 // Used for marking tombstone sequences. 
5926 tbit = 1 << 62 5927 // Used to mark an index as deleted and non-existent. 5928 dbit = 1 << 30 5929 ) 5930 5931 // Will do a lookup from cache. 5932 // Lock should be held. 5933 func (mb *msgBlock) cacheLookup(seq uint64, sm *StoreMsg) (*StoreMsg, error) { 5934 if seq < atomic.LoadUint64(&mb.first.seq) || seq > atomic.LoadUint64(&mb.last.seq) { 5935 return nil, ErrStoreMsgNotFound 5936 } 5937 5938 // The llseq signals us when we can expire a cache at the end of a linear scan. 5939 // We want to only update when we know the last reads (multiple consumers) are sequential. 5940 // We want to account for forwards and backwards linear scans. 5941 if mb.llseq == 0 || seq < mb.llseq || seq == mb.llseq+1 || seq == mb.llseq-1 { 5942 mb.llseq = seq 5943 } 5944 5945 // If we have a delete map check it. 5946 if mb.dmap.Exists(seq) { 5947 mb.llts = time.Now().UnixNano() 5948 return nil, errDeletedMsg 5949 } 5950 5951 // Detect no cache loaded. 5952 if mb.cache == nil || mb.cache.fseq == 0 || len(mb.cache.idx) == 0 || len(mb.cache.buf) == 0 { 5953 var reason string 5954 if mb.cache == nil { 5955 reason = "no cache" 5956 } else if mb.cache.fseq == 0 { 5957 reason = "fseq is 0" 5958 } else if len(mb.cache.idx) == 0 { 5959 reason = "no idx present" 5960 } else { 5961 reason = "cache buf empty" 5962 } 5963 mb.fs.warn("Cache lookup detected no cache: %s", reason) 5964 return nil, errNoCache 5965 } 5966 // Check partial cache status. 5967 if seq < mb.cache.fseq { 5968 mb.fs.warn("Cache lookup detected partial cache: seq %d vs cache fseq %d", seq, mb.cache.fseq) 5969 return nil, errPartialCache 5970 } 5971 5972 bi, _, hashChecked, err := mb.slotInfo(int(seq - mb.cache.fseq)) 5973 if err != nil { 5974 return nil, err 5975 } 5976 5977 // Update cache activity. 
5978 mb.llts = time.Now().UnixNano() 5979 5980 li := int(bi) - mb.cache.off 5981 if li >= len(mb.cache.buf) { 5982 return nil, errPartialCache 5983 } 5984 buf := mb.cache.buf[li:] 5985 5986 // We use the high bit to denote we have already checked the checksum. 5987 var hh hash.Hash64 5988 if !hashChecked { 5989 hh = mb.hh // This will force the hash check in msgFromBuf. 5990 } 5991 5992 // Parse from the raw buffer. 5993 fsm, err := mb.msgFromBuf(buf, sm, hh) 5994 if err != nil || fsm == nil { 5995 return nil, err 5996 } 5997 5998 // Deleted messages that are decoded return a 0 for sequence. 5999 if fsm.seq == 0 { 6000 return nil, errDeletedMsg 6001 } 6002 6003 if seq != fsm.seq { 6004 recycleMsgBlockBuf(mb.cache.buf) 6005 mb.cache.buf = nil 6006 return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, fsm.seq) 6007 } 6008 6009 // Clear the check bit here after we know all is good. 6010 if !hashChecked { 6011 mb.cache.idx[seq-mb.cache.fseq] = (bi | hbit) 6012 } 6013 6014 return fsm, nil 6015 } 6016 6017 // Used when we are checking if discarding a message due to max msgs per subject will give us 6018 // enough room for a max bytes condition. 6019 // Lock should be already held. 6020 func (fs *fileStore) sizeForSeq(seq uint64) int { 6021 if seq == 0 { 6022 return 0 6023 } 6024 var smv StoreMsg 6025 if mb := fs.selectMsgBlock(seq); mb != nil { 6026 if sm, _, _ := mb.fetchMsg(seq, &smv); sm != nil { 6027 return int(fileStoreMsgSize(sm.subj, sm.hdr, sm.msg)) 6028 } 6029 } 6030 return 0 6031 } 6032 6033 // Will return message for the given sequence number. 6034 func (fs *fileStore) msgForSeq(seq uint64, sm *StoreMsg) (*StoreMsg, error) { 6035 // TODO(dlc) - Since Store, Remove, Skip all hold the write lock on fs this will 6036 // be stalled. Need another lock if want to happen in parallel. 6037 fs.mu.RLock() 6038 if fs.closed { 6039 fs.mu.RUnlock() 6040 return nil, ErrStoreClosed 6041 } 6042 // Indicates we want first msg. 
6043 if seq == 0 { 6044 seq = fs.state.FirstSeq 6045 } 6046 // Make sure to snapshot here. 6047 mb, lseq := fs.selectMsgBlock(seq), fs.state.LastSeq 6048 fs.mu.RUnlock() 6049 6050 if mb == nil { 6051 var err = ErrStoreEOF 6052 if seq <= lseq { 6053 err = ErrStoreMsgNotFound 6054 } 6055 return nil, err 6056 } 6057 6058 fsm, expireOk, err := mb.fetchMsg(seq, sm) 6059 if err != nil { 6060 return nil, err 6061 } 6062 6063 // We detected a linear scan and access to the last message. 6064 // If we are not the last message block we can try to expire the cache. 6065 if expireOk { 6066 mb.tryForceExpireCache() 6067 } 6068 6069 return fsm, nil 6070 } 6071 6072 // Internal function to return msg parts from a raw buffer. 6073 // Lock should be held. 6074 func (mb *msgBlock) msgFromBuf(buf []byte, sm *StoreMsg, hh hash.Hash64) (*StoreMsg, error) { 6075 if len(buf) < emptyRecordLen { 6076 return nil, errBadMsg 6077 } 6078 var le = binary.LittleEndian 6079 6080 hdr := buf[:msgHdrSize] 6081 rl := le.Uint32(hdr[0:]) 6082 hasHeaders := rl&hbit != 0 6083 rl &^= hbit // clear header bit 6084 dlen := int(rl) - msgHdrSize 6085 slen := int(le.Uint16(hdr[20:])) 6086 // Simple sanity check. 6087 if dlen < 0 || slen > (dlen-recordHashSize) || dlen > int(rl) || int(rl) > len(buf) { 6088 return nil, errBadMsg 6089 } 6090 data := buf[msgHdrSize : msgHdrSize+dlen] 6091 // Do checksum tests here if requested. 6092 if hh != nil { 6093 hh.Reset() 6094 hh.Write(hdr[4:20]) 6095 hh.Write(data[:slen]) 6096 if hasHeaders { 6097 hh.Write(data[slen+4 : dlen-recordHashSize]) 6098 } else { 6099 hh.Write(data[slen : dlen-recordHashSize]) 6100 } 6101 if !bytes.Equal(hh.Sum(nil), data[len(data)-8:]) { 6102 return nil, errBadMsg 6103 } 6104 } 6105 seq := le.Uint64(hdr[4:]) 6106 if seq&ebit != 0 { 6107 seq = 0 6108 } 6109 ts := int64(le.Uint64(hdr[12:])) 6110 6111 // Create a StoreMsg if needed. 
6112 if sm == nil { 6113 sm = new(StoreMsg) 6114 } else { 6115 sm.clear() 6116 } 6117 // To recycle the large blocks we can never pass back a reference, so need to copy for the upper 6118 // layers and for us to be safe to expire, and recycle, the large msgBlocks. 6119 end := dlen - 8 6120 6121 if hasHeaders { 6122 hl := le.Uint32(data[slen:]) 6123 bi := slen + 4 6124 li := bi + int(hl) 6125 sm.buf = append(sm.buf, data[bi:end]...) 6126 li, end = li-bi, end-bi 6127 sm.hdr = sm.buf[0:li:li] 6128 sm.msg = sm.buf[li:end] 6129 } else { 6130 sm.buf = append(sm.buf, data[slen:end]...) 6131 sm.msg = sm.buf[0 : end-slen] 6132 } 6133 sm.seq, sm.ts = seq, ts 6134 if slen > 0 { 6135 // Make a copy since sm.subj lifetime may last longer. 6136 sm.subj = string(data[:slen]) 6137 } 6138 6139 return sm, nil 6140 } 6141 6142 // LoadMsg will lookup the message by sequence number and return it if found. 6143 func (fs *fileStore) LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) { 6144 return fs.msgForSeq(seq, sm) 6145 } 6146 6147 // loadLast will load the last message for a subject. Subject should be non empty and not ">". 6148 func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err error) { 6149 fs.mu.RLock() 6150 defer fs.mu.RUnlock() 6151 6152 if fs.closed || fs.lmb == nil { 6153 return nil, ErrStoreClosed 6154 } 6155 6156 if len(fs.blks) == 0 { 6157 return nil, ErrStoreMsgNotFound 6158 } 6159 6160 start, stop := fs.lmb.index, fs.blks[0].index 6161 wc := subjectHasWildcard(subj) 6162 // If literal subject check for presence. 6163 if !wc { 6164 if info, ok := fs.psim.Find(stringToBytes(subj)); !ok { 6165 return nil, ErrStoreMsgNotFound 6166 } else { 6167 start, stop = info.lblk, info.fblk 6168 } 6169 } 6170 6171 // Walk blocks backwards. 
6172 for i := start; i >= stop; i-- { 6173 mb := fs.bim[i] 6174 if mb == nil { 6175 continue 6176 } 6177 mb.mu.Lock() 6178 if err := mb.ensurePerSubjectInfoLoaded(); err != nil { 6179 mb.mu.Unlock() 6180 return nil, err 6181 } 6182 var l uint64 6183 // Optimize if subject is not a wildcard. 6184 if !wc { 6185 if ss := mb.fss[subj]; ss != nil { 6186 l = ss.Last 6187 } 6188 } 6189 if l == 0 { 6190 _, _, l = mb.filteredPendingLocked(subj, wc, atomic.LoadUint64(&mb.first.seq)) 6191 } 6192 if l > 0 { 6193 if mb.cacheNotLoaded() { 6194 if err := mb.loadMsgsWithLock(); err != nil { 6195 mb.mu.Unlock() 6196 return nil, err 6197 } 6198 } 6199 lsm, err = mb.cacheLookup(l, sm) 6200 } 6201 mb.mu.Unlock() 6202 if l > 0 { 6203 break 6204 } 6205 } 6206 return lsm, err 6207 } 6208 6209 // LoadLastMsg will return the last message we have that matches a given subject. 6210 // The subject can be a wildcard. 6211 func (fs *fileStore) LoadLastMsg(subject string, smv *StoreMsg) (sm *StoreMsg, err error) { 6212 if subject == _EMPTY_ || subject == fwcs { 6213 sm, err = fs.msgForSeq(fs.lastSeq(), smv) 6214 } else { 6215 sm, err = fs.loadLast(subject, smv) 6216 } 6217 if sm == nil || (err != nil && err != ErrStoreClosed) { 6218 err = ErrStoreMsgNotFound 6219 } 6220 return sm, err 6221 } 6222 6223 // LoadNextMsgMulti will find the next message matching any entry in the sublist. 
6224 func (fs *fileStore) LoadNextMsgMulti(sl *Sublist, start uint64, smp *StoreMsg) (sm *StoreMsg, skip uint64, err error) { 6225 if sl == nil { 6226 return fs.LoadNextMsg(_EMPTY_, false, start, smp) 6227 } 6228 fs.mu.RLock() 6229 defer fs.mu.RUnlock() 6230 6231 if fs.closed { 6232 return nil, 0, ErrStoreClosed 6233 } 6234 if fs.state.Msgs == 0 { 6235 return nil, fs.state.LastSeq, ErrStoreEOF 6236 } 6237 if start < fs.state.FirstSeq { 6238 start = fs.state.FirstSeq 6239 } 6240 6241 if bi, _ := fs.selectMsgBlockWithIndex(start); bi >= 0 { 6242 for i := bi; i < len(fs.blks); i++ { 6243 mb := fs.blks[i] 6244 if sm, expireOk, err := mb.firstMatchingMulti(sl, start, smp); err == nil { 6245 if expireOk { 6246 mb.tryForceExpireCache() 6247 } 6248 return sm, sm.seq, nil 6249 } else if err != ErrStoreMsgNotFound { 6250 return nil, 0, err 6251 } else if expireOk { 6252 mb.tryForceExpireCache() 6253 } 6254 } 6255 } 6256 6257 return nil, fs.state.LastSeq, ErrStoreEOF 6258 6259 } 6260 6261 func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, uint64, error) { 6262 fs.mu.RLock() 6263 defer fs.mu.RUnlock() 6264 6265 if fs.closed { 6266 return nil, 0, ErrStoreClosed 6267 } 6268 if fs.state.Msgs == 0 { 6269 return nil, fs.state.LastSeq, ErrStoreEOF 6270 } 6271 if start < fs.state.FirstSeq { 6272 start = fs.state.FirstSeq 6273 } 6274 6275 // If start is less than or equal to beginning of our stream, meaning our first call, 6276 // let's check the psim to see if we can skip ahead. 
6277 if start <= fs.state.FirstSeq { 6278 var ss SimpleState 6279 fs.numFilteredPending(filter, &ss) 6280 if ss.First > start { 6281 start = ss.First 6282 } 6283 } 6284 6285 if bi, _ := fs.selectMsgBlockWithIndex(start); bi >= 0 { 6286 for i := bi; i < len(fs.blks); i++ { 6287 mb := fs.blks[i] 6288 if sm, expireOk, err := mb.firstMatching(filter, wc, start, sm); err == nil { 6289 if expireOk { 6290 mb.tryForceExpireCache() 6291 } 6292 return sm, sm.seq, nil 6293 } else if err != ErrStoreMsgNotFound { 6294 return nil, 0, err 6295 } else if expireOk { 6296 mb.tryForceExpireCache() 6297 } 6298 } 6299 } 6300 6301 return nil, fs.state.LastSeq, ErrStoreEOF 6302 } 6303 6304 // Type returns the type of the underlying store. 6305 func (fs *fileStore) Type() StorageType { 6306 return FileStorage 6307 } 6308 6309 // Returns number of subjects in this store. 6310 // Lock should be held. 6311 func (fs *fileStore) numSubjects() int { 6312 return fs.psim.Size() 6313 } 6314 6315 // numConsumers uses new lock. 6316 func (fs *fileStore) numConsumers() int { 6317 fs.cmu.RLock() 6318 defer fs.cmu.RUnlock() 6319 return len(fs.cfs) 6320 } 6321 6322 // FastState will fill in state with only the following. 6323 // Msgs, Bytes, First and Last Sequence and Time and NumDeleted. 6324 func (fs *fileStore) FastState(state *StreamState) { 6325 fs.mu.RLock() 6326 state.Msgs = fs.state.Msgs 6327 state.Bytes = fs.state.Bytes 6328 state.FirstSeq = fs.state.FirstSeq 6329 state.FirstTime = fs.state.FirstTime 6330 state.LastSeq = fs.state.LastSeq 6331 state.LastTime = fs.state.LastTime 6332 // Make sure to reset if being re-used. 
6333 state.Deleted, state.NumDeleted = nil, 0 6334 if state.LastSeq > state.FirstSeq { 6335 state.NumDeleted = int((state.LastSeq - state.FirstSeq + 1) - state.Msgs) 6336 if state.NumDeleted < 0 { 6337 state.NumDeleted = 0 6338 } 6339 } 6340 state.Consumers = fs.numConsumers() 6341 state.NumSubjects = fs.numSubjects() 6342 fs.mu.RUnlock() 6343 } 6344 6345 // State returns the current state of the stream. 6346 func (fs *fileStore) State() StreamState { 6347 fs.mu.RLock() 6348 state := fs.state 6349 state.Consumers = fs.numConsumers() 6350 state.NumSubjects = fs.numSubjects() 6351 state.Deleted = nil // make sure. 6352 6353 if numDeleted := int((state.LastSeq - state.FirstSeq + 1) - state.Msgs); numDeleted > 0 { 6354 state.Deleted = make([]uint64, 0, numDeleted) 6355 cur := fs.state.FirstSeq 6356 6357 for _, mb := range fs.blks { 6358 mb.mu.Lock() 6359 fseq := atomic.LoadUint64(&mb.first.seq) 6360 // Account for messages missing from the head. 6361 if fseq > cur { 6362 for seq := cur; seq < fseq; seq++ { 6363 state.Deleted = append(state.Deleted, seq) 6364 } 6365 } 6366 cur = atomic.LoadUint64(&mb.last.seq) + 1 // Expected next first. 6367 6368 mb.dmap.Range(func(seq uint64) bool { 6369 if seq < fseq { 6370 mb.dmap.Delete(seq) 6371 } else { 6372 state.Deleted = append(state.Deleted, seq) 6373 } 6374 return true 6375 }) 6376 mb.mu.Unlock() 6377 } 6378 } 6379 fs.mu.RUnlock() 6380 6381 state.Lost = fs.lostData() 6382 6383 // Can not be guaranteed to be sorted. 
6384 if len(state.Deleted) > 0 { 6385 sort.Slice(state.Deleted, func(i, j int) bool { 6386 return state.Deleted[i] < state.Deleted[j] 6387 }) 6388 state.NumDeleted = len(state.Deleted) 6389 } 6390 return state 6391 } 6392 6393 func (fs *fileStore) Utilization() (total, reported uint64, err error) { 6394 fs.mu.RLock() 6395 defer fs.mu.RUnlock() 6396 for _, mb := range fs.blks { 6397 mb.mu.RLock() 6398 reported += mb.bytes 6399 total += mb.rbytes 6400 mb.mu.RUnlock() 6401 } 6402 return total, reported, nil 6403 } 6404 6405 func fileStoreMsgSize(subj string, hdr, msg []byte) uint64 { 6406 if len(hdr) == 0 { 6407 // length of the message record (4bytes) + seq(8) + ts(8) + subj_len(2) + subj + msg + hash(8) 6408 return uint64(22 + len(subj) + len(msg) + 8) 6409 } 6410 // length of the message record (4bytes) + seq(8) + ts(8) + subj_len(2) + subj + hdr_len(4) + hdr + msg + hash(8) 6411 return uint64(22 + len(subj) + 4 + len(hdr) + len(msg) + 8) 6412 } 6413 6414 func fileStoreMsgSizeEstimate(slen, maxPayload int) uint64 { 6415 return uint64(emptyRecordLen + slen + 4 + maxPayload) 6416 } 6417 6418 // Determine time since any last activity, read/load, write or remove. 6419 func (mb *msgBlock) sinceLastActivity() time.Duration { 6420 if mb.closed { 6421 return 0 6422 } 6423 last := mb.lwts 6424 if mb.lrts > last { 6425 last = mb.lrts 6426 } 6427 if mb.llts > last { 6428 last = mb.llts 6429 } 6430 return time.Since(time.Unix(0, last).UTC()) 6431 } 6432 6433 // Determine time since last write or remove of a message. 6434 // Read lock should be held. 
6435 func (mb *msgBlock) sinceLastWriteActivity() time.Duration { 6436 if mb.closed { 6437 return 0 6438 } 6439 last := mb.lwts 6440 if mb.lrts > last { 6441 last = mb.lrts 6442 } 6443 return time.Since(time.Unix(0, last).UTC()) 6444 } 6445 6446 func checkNewHeader(hdr []byte) error { 6447 if hdr == nil || len(hdr) < 2 || hdr[0] != magic || 6448 (hdr[1] != version && hdr[1] != newVersion) { 6449 return errCorruptState 6450 } 6451 return nil 6452 } 6453 6454 // readIndexInfo will read in the index information for the message block. 6455 func (mb *msgBlock) readIndexInfo() error { 6456 ifn := filepath.Join(mb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(indexScan, mb.index)) 6457 buf, err := os.ReadFile(ifn) 6458 if err != nil { 6459 return err 6460 } 6461 6462 // Set if first time. 6463 if mb.liwsz == 0 { 6464 mb.liwsz = int64(len(buf)) 6465 } 6466 6467 // Decrypt if needed. 6468 if mb.aek != nil { 6469 buf, err = mb.aek.Open(buf[:0], mb.nonce, buf, nil) 6470 if err != nil { 6471 return err 6472 } 6473 } 6474 6475 if err := checkNewHeader(buf); err != nil { 6476 defer os.Remove(ifn) 6477 return fmt.Errorf("bad index file") 6478 } 6479 6480 bi := hdrLen 6481 6482 // Helpers, will set i to -1 on error. 6483 readSeq := func() uint64 { 6484 if bi < 0 { 6485 return 0 6486 } 6487 seq, n := binary.Uvarint(buf[bi:]) 6488 if n <= 0 { 6489 bi = -1 6490 return 0 6491 } 6492 bi += n 6493 return seq &^ ebit 6494 } 6495 readCount := readSeq 6496 readTimeStamp := func() int64 { 6497 if bi < 0 { 6498 return 0 6499 } 6500 ts, n := binary.Varint(buf[bi:]) 6501 if n <= 0 { 6502 bi = -1 6503 return -1 6504 } 6505 bi += n 6506 return ts 6507 } 6508 mb.msgs = readCount() 6509 mb.bytes = readCount() 6510 atomic.StoreUint64(&mb.first.seq, readSeq()) 6511 mb.first.ts = readTimeStamp() 6512 atomic.StoreUint64(&mb.last.seq, readSeq()) 6513 mb.last.ts = readTimeStamp() 6514 dmapLen := readCount() 6515 6516 // Check if this is a short write index file. 
6517 if bi < 0 || bi+checksumSize > len(buf) { 6518 os.Remove(ifn) 6519 return fmt.Errorf("short index file") 6520 } 6521 6522 // Check for consistency if accounting. If something is off bail and we will rebuild. 6523 if mb.msgs != (atomic.LoadUint64(&mb.last.seq)-atomic.LoadUint64(&mb.first.seq)+1)-dmapLen { 6524 os.Remove(ifn) 6525 return fmt.Errorf("accounting inconsistent") 6526 } 6527 6528 // Checksum 6529 copy(mb.lchk[0:], buf[bi:bi+checksumSize]) 6530 bi += checksumSize 6531 6532 // Now check for presence of a delete map 6533 if dmapLen > 0 { 6534 // New version is encoded avl seqset. 6535 if buf[1] == newVersion { 6536 dmap, _, err := avl.Decode(buf[bi:]) 6537 if err != nil { 6538 return fmt.Errorf("could not decode avl dmap: %v", err) 6539 } 6540 mb.dmap = *dmap 6541 } else { 6542 // This is the old version. 6543 for i, fseq := 0, atomic.LoadUint64(&mb.first.seq); i < int(dmapLen); i++ { 6544 seq := readSeq() 6545 if seq == 0 { 6546 break 6547 } 6548 mb.dmap.Insert(seq + fseq) 6549 } 6550 } 6551 } 6552 6553 return nil 6554 } 6555 6556 // Will return total number of cache loads. 6557 func (fs *fileStore) cacheLoads() uint64 { 6558 var tl uint64 6559 fs.mu.RLock() 6560 for _, mb := range fs.blks { 6561 tl += mb.cloads 6562 } 6563 fs.mu.RUnlock() 6564 return tl 6565 } 6566 6567 // Will return total number of cached bytes. 6568 func (fs *fileStore) cacheSize() uint64 { 6569 var sz uint64 6570 fs.mu.RLock() 6571 for _, mb := range fs.blks { 6572 mb.mu.RLock() 6573 if mb.cache != nil { 6574 sz += uint64(len(mb.cache.buf)) 6575 } 6576 mb.mu.RUnlock() 6577 } 6578 fs.mu.RUnlock() 6579 return sz 6580 } 6581 6582 // Will return total number of dmapEntries for all msg blocks. 6583 func (fs *fileStore) dmapEntries() int { 6584 var total int 6585 fs.mu.RLock() 6586 for _, mb := range fs.blks { 6587 total += mb.dmap.Size() 6588 } 6589 fs.mu.RUnlock() 6590 return total 6591 } 6592 6593 // Fixed helper for iterating. 
6594 func subjectsEqual(a, b string) bool { 6595 return a == b 6596 } 6597 6598 func subjectsAll(a, b string) bool { 6599 return true 6600 } 6601 6602 func compareFn(subject string) func(string, string) bool { 6603 if subject == _EMPTY_ || subject == fwcs { 6604 return subjectsAll 6605 } 6606 if subjectHasWildcard(subject) { 6607 return subjectIsSubsetMatch 6608 } 6609 return subjectsEqual 6610 } 6611 6612 // PurgeEx will remove messages based on subject filters, sequence and number of messages to keep. 6613 // Will return the number of purged messages. 6614 func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint64, err error) { 6615 if subject == _EMPTY_ || subject == fwcs { 6616 if keep == 0 && sequence == 0 { 6617 return fs.Purge() 6618 } 6619 if sequence > 1 { 6620 return fs.Compact(sequence) 6621 } 6622 } 6623 6624 eq, wc := compareFn(subject), subjectHasWildcard(subject) 6625 var firstSeqNeedsUpdate bool 6626 var bytes uint64 6627 6628 // If we have a "keep" designation need to get full filtered state so we know how many to purge. 6629 var maxp uint64 6630 if keep > 0 { 6631 ss := fs.FilteredState(1, subject) 6632 if keep >= ss.Msgs { 6633 return 0, nil 6634 } 6635 maxp = ss.Msgs - keep 6636 } 6637 6638 var smv StoreMsg 6639 6640 fs.mu.Lock() 6641 // We may remove blocks as we purge, so don't range directly on fs.blks 6642 // otherwise we may jump over some (see https://github.com/nats-io/nats-server/issues/3528) 6643 for i := 0; i < len(fs.blks); i++ { 6644 mb := fs.blks[i] 6645 mb.mu.Lock() 6646 6647 // If we do not have our fss, try to expire the cache if we have no items in this block. 6648 shouldExpire := mb.fssNotLoaded() 6649 6650 t, f, l := mb.filteredPendingLocked(subject, wc, atomic.LoadUint64(&mb.first.seq)) 6651 if t == 0 { 6652 // Expire if we were responsible for loading. 6653 if shouldExpire { 6654 // Expire this cache before moving on. 
6655 mb.tryForceExpireCacheLocked() 6656 } 6657 mb.mu.Unlock() 6658 continue 6659 } 6660 6661 if sequence > 1 && sequence <= l { 6662 l = sequence - 1 6663 } 6664 6665 if mb.cacheNotLoaded() { 6666 mb.loadMsgsWithLock() 6667 shouldExpire = true 6668 } 6669 6670 for seq := f; seq <= l; seq++ { 6671 if sm, _ := mb.cacheLookup(seq, &smv); sm != nil && eq(sm.subj, subject) { 6672 rl := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) 6673 // Do fast in place remove. 6674 // Stats 6675 if mb.msgs > 0 { 6676 // Msgs 6677 fs.state.Msgs-- 6678 mb.msgs-- 6679 // Bytes, make sure to not go negative. 6680 if rl > fs.state.Bytes { 6681 rl = fs.state.Bytes 6682 } 6683 if rl > mb.bytes { 6684 rl = mb.bytes 6685 } 6686 fs.state.Bytes -= rl 6687 mb.bytes -= rl 6688 // Totals 6689 purged++ 6690 bytes += rl 6691 } 6692 // FSS updates. 6693 mb.removeSeqPerSubject(sm.subj, seq) 6694 fs.removePerSubject(sm.subj) 6695 6696 // Check for first message. 6697 if seq == atomic.LoadUint64(&mb.first.seq) { 6698 mb.selectNextFirst() 6699 if mb.isEmpty() { 6700 fs.removeMsgBlock(mb) 6701 i-- 6702 // keep flag set, if set previously 6703 firstSeqNeedsUpdate = firstSeqNeedsUpdate || seq == fs.state.FirstSeq 6704 } else if seq == fs.state.FirstSeq { 6705 fs.state.FirstSeq = atomic.LoadUint64(&mb.first.seq) // new one. 6706 fs.state.FirstTime = time.Unix(0, mb.first.ts).UTC() 6707 } 6708 } else { 6709 // Out of order delete. 6710 mb.dmap.Insert(seq) 6711 } 6712 6713 if maxp > 0 && purged >= maxp { 6714 break 6715 } 6716 } 6717 } 6718 // Expire if we were responsible for loading. 6719 if shouldExpire { 6720 // Expire this cache before moving on. 6721 mb.tryForceExpireCacheLocked() 6722 } 6723 mb.mu.Unlock() 6724 6725 // Check if we should break out of top level too. 
6726 if maxp > 0 && purged >= maxp { 6727 break 6728 } 6729 } 6730 if firstSeqNeedsUpdate { 6731 fs.selectNextFirst() 6732 } 6733 6734 fs.dirty++ 6735 cb := fs.scb 6736 fs.mu.Unlock() 6737 6738 if cb != nil { 6739 cb(-int64(purged), -int64(bytes), 0, _EMPTY_) 6740 } 6741 6742 return purged, nil 6743 } 6744 6745 // Purge will remove all messages from this store. 6746 // Will return the number of purged messages. 6747 func (fs *fileStore) Purge() (uint64, error) { 6748 return fs.purge(0) 6749 } 6750 6751 func (fs *fileStore) purge(fseq uint64) (uint64, error) { 6752 fs.mu.Lock() 6753 if fs.closed { 6754 fs.mu.Unlock() 6755 return 0, ErrStoreClosed 6756 } 6757 6758 purged := fs.state.Msgs 6759 rbytes := int64(fs.state.Bytes) 6760 6761 fs.state.FirstSeq = fs.state.LastSeq + 1 6762 fs.state.FirstTime = time.Time{} 6763 6764 fs.state.Bytes = 0 6765 fs.state.Msgs = 0 6766 6767 for _, mb := range fs.blks { 6768 mb.dirtyClose() 6769 } 6770 6771 fs.blks = nil 6772 fs.lmb = nil 6773 fs.bim = make(map[uint32]*msgBlock) 6774 // Clear any per subject tracking. 6775 fs.psim, fs.tsl = fs.psim.Empty(), 0 6776 // Mark dirty 6777 fs.dirty++ 6778 6779 // Move the msgs directory out of the way, will delete out of band. 6780 // FIXME(dlc) - These can error and we need to change api above to propagate? 6781 mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) 6782 pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) 6783 // If purge directory still exists then we need to wait 6784 // in place and remove since rename would fail. 6785 if _, err := os.Stat(pdir); err == nil { 6786 <-dios 6787 os.RemoveAll(pdir) 6788 dios <- struct{}{} 6789 } 6790 6791 <-dios 6792 os.Rename(mdir, pdir) 6793 dios <- struct{}{} 6794 6795 go func() { 6796 <-dios 6797 os.RemoveAll(pdir) 6798 dios <- struct{}{} 6799 }() 6800 6801 // Create new one. 6802 <-dios 6803 os.MkdirAll(mdir, defaultDirPerms) 6804 dios <- struct{}{} 6805 6806 // Make sure we have a lmb to write to. 
6807 if _, err := fs.newMsgBlockForWrite(); err != nil { 6808 fs.mu.Unlock() 6809 return purged, err 6810 } 6811 6812 // Check if we need to set the first seq to a new number. 6813 if fseq > fs.state.FirstSeq { 6814 fs.state.FirstSeq = fseq 6815 fs.state.LastSeq = fseq - 1 6816 } 6817 6818 lmb := fs.lmb 6819 atomic.StoreUint64(&lmb.first.seq, fs.state.FirstSeq) 6820 atomic.StoreUint64(&lmb.last.seq, fs.state.LastSeq) 6821 lmb.last.ts = fs.state.LastTime.UnixNano() 6822 6823 if lseq := atomic.LoadUint64(&lmb.last.seq); lseq > 1 { 6824 // Leave a tombstone so we can remember our starting sequence in case 6825 // full state becomes corrupted. 6826 lmb.writeTombstone(lseq, lmb.last.ts) 6827 } 6828 6829 cb := fs.scb 6830 fs.mu.Unlock() 6831 6832 if cb != nil { 6833 cb(-int64(purged), -rbytes, 0, _EMPTY_) 6834 } 6835 6836 return purged, nil 6837 } 6838 6839 // Compact will remove all messages from this store up to 6840 // but not including the seq parameter. 6841 // Will return the number of purged messages. 6842 func (fs *fileStore) Compact(seq uint64) (uint64, error) { 6843 if seq == 0 { 6844 return fs.purge(seq) 6845 } 6846 6847 var purged, bytes uint64 6848 6849 fs.mu.Lock() 6850 // Same as purge all. 6851 if lseq := fs.state.LastSeq; seq > lseq { 6852 fs.mu.Unlock() 6853 return fs.purge(seq) 6854 } 6855 // We have to delete interior messages. 6856 smb := fs.selectMsgBlock(seq) 6857 if smb == nil { 6858 fs.mu.Unlock() 6859 return 0, nil 6860 } 6861 6862 // All msgblocks up to this one can be thrown away. 6863 var deleted int 6864 for _, mb := range fs.blks { 6865 if mb == smb { 6866 break 6867 } 6868 mb.mu.Lock() 6869 purged += mb.msgs 6870 bytes += mb.bytes 6871 // Make sure we do subject cleanup as well. 6872 mb.ensurePerSubjectInfoLoaded() 6873 for subj, ss := range mb.fss { 6874 for i := uint64(0); i < ss.Msgs; i++ { 6875 fs.removePerSubject(subj) 6876 } 6877 } 6878 // Now close. 
6879 mb.dirtyCloseWithRemove(true) 6880 mb.mu.Unlock() 6881 deleted++ 6882 } 6883 6884 var smv StoreMsg 6885 var err error 6886 6887 smb.mu.Lock() 6888 if atomic.LoadUint64(&smb.first.seq) == seq { 6889 fs.state.FirstSeq = atomic.LoadUint64(&smb.first.seq) 6890 fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC() 6891 goto SKIP 6892 } 6893 6894 // Make sure we have the messages loaded. 6895 if smb.cacheNotLoaded() { 6896 if err = smb.loadMsgsWithLock(); err != nil { 6897 goto SKIP 6898 } 6899 } 6900 for mseq := atomic.LoadUint64(&smb.first.seq); mseq < seq; mseq++ { 6901 sm, err := smb.cacheLookup(mseq, &smv) 6902 if err == errDeletedMsg { 6903 // Update dmap. 6904 if !smb.dmap.IsEmpty() { 6905 smb.dmap.Delete(seq) 6906 } 6907 } else if sm != nil { 6908 sz := fileStoreMsgSize(sm.subj, sm.hdr, sm.msg) 6909 if smb.msgs > 0 { 6910 smb.msgs-- 6911 if sz > smb.bytes { 6912 sz = smb.bytes 6913 } 6914 smb.bytes -= sz 6915 bytes += sz 6916 purged++ 6917 } 6918 // Update fss 6919 smb.removeSeqPerSubject(sm.subj, mseq) 6920 fs.removePerSubject(sm.subj) 6921 } 6922 } 6923 6924 // Check if empty after processing, could happen if tail of messages are all deleted. 6925 if isEmpty := smb.msgs == 0; isEmpty { 6926 // Only remove if not the last block. 6927 if smb != fs.lmb { 6928 smb.dirtyCloseWithRemove(true) 6929 deleted++ 6930 } 6931 // Update fs first here as well. 6932 fs.state.FirstSeq = atomic.LoadUint64(&smb.last.seq) + 1 6933 fs.state.FirstTime = time.Time{} 6934 6935 } else { 6936 // Make sure to sync changes. 6937 smb.needSync = true 6938 // Update fs first seq and time. 6939 atomic.StoreUint64(&smb.first.seq, seq-1) // Just for start condition for selectNextFirst. 6940 smb.selectNextFirst() 6941 6942 fs.state.FirstSeq = atomic.LoadUint64(&smb.first.seq) 6943 fs.state.FirstTime = time.Unix(0, smb.first.ts).UTC() 6944 6945 // Check if we should reclaim the head space from this block. 
6946 // This will be optimistic only, so don't continue if we encounter any errors here. 6947 if smb.rbytes > compactMinimum && smb.bytes*2 < smb.rbytes { 6948 var moff uint32 6949 moff, _, _, err = smb.slotInfo(int(atomic.LoadUint64(&smb.first.seq) - smb.cache.fseq)) 6950 if err != nil || moff >= uint32(len(smb.cache.buf)) { 6951 goto SKIP 6952 } 6953 buf := smb.cache.buf[moff:] 6954 // Don't reuse, copy to new recycled buf. 6955 nbuf := getMsgBlockBuf(len(buf)) 6956 nbuf = append(nbuf, buf...) 6957 smb.closeFDsLockedNoCheck() 6958 // Check for encryption. 6959 if smb.bek != nil && len(nbuf) > 0 { 6960 // Recreate to reset counter. 6961 bek, err := genBlockEncryptionKey(smb.fs.fcfg.Cipher, smb.seed, smb.nonce) 6962 if err != nil { 6963 goto SKIP 6964 } 6965 // For future writes make sure to set smb.bek to keep counter correct. 6966 smb.bek = bek 6967 smb.bek.XORKeyStream(nbuf, nbuf) 6968 } 6969 // Recompress if necessary (smb.cmp contains the algorithm used when 6970 // the block was loaded from disk, or defaults to NoCompression if not) 6971 if nbuf, err = smb.cmp.Compress(nbuf); err != nil { 6972 goto SKIP 6973 } 6974 <-dios 6975 err = os.WriteFile(smb.mfn, nbuf, defaultFilePerms) 6976 dios <- struct{}{} 6977 if err != nil { 6978 goto SKIP 6979 } 6980 // Make sure to remove fss state. 6981 smb.fss = nil 6982 smb.clearCacheAndOffset() 6983 smb.rbytes = uint64(len(nbuf)) 6984 } 6985 } 6986 6987 SKIP: 6988 smb.mu.Unlock() 6989 6990 if deleted > 0 { 6991 // Update block map. 6992 if fs.bim != nil { 6993 for _, mb := range fs.blks[:deleted] { 6994 delete(fs.bim, mb.index) 6995 } 6996 } 6997 // Update blks slice. 6998 fs.blks = copyMsgBlocks(fs.blks[deleted:]) 6999 if lb := len(fs.blks); lb == 0 { 7000 fs.lmb = nil 7001 } else { 7002 fs.lmb = fs.blks[lb-1] 7003 } 7004 } 7005 7006 // Update top level accounting. 
7007 if purged > fs.state.Msgs { 7008 purged = fs.state.Msgs 7009 } 7010 fs.state.Msgs -= purged 7011 7012 if bytes > fs.state.Bytes { 7013 bytes = fs.state.Bytes 7014 } 7015 fs.state.Bytes -= bytes 7016 7017 fs.dirty++ 7018 7019 cb := fs.scb 7020 fs.mu.Unlock() 7021 7022 if cb != nil && purged > 0 { 7023 cb(-int64(purged), -int64(bytes), 0, _EMPTY_) 7024 } 7025 7026 return purged, err 7027 } 7028 7029 // Will completely reset our store. 7030 func (fs *fileStore) reset() error { 7031 fs.mu.Lock() 7032 if fs.closed { 7033 fs.mu.Unlock() 7034 return ErrStoreClosed 7035 } 7036 if fs.sips > 0 { 7037 fs.mu.Unlock() 7038 return ErrStoreSnapshotInProgress 7039 } 7040 7041 var purged, bytes uint64 7042 cb := fs.scb 7043 7044 for _, mb := range fs.blks { 7045 mb.mu.Lock() 7046 purged += mb.msgs 7047 bytes += mb.bytes 7048 mb.dirtyCloseWithRemove(true) 7049 mb.mu.Unlock() 7050 } 7051 7052 // Reset 7053 fs.state.FirstSeq = 0 7054 fs.state.FirstTime = time.Time{} 7055 fs.state.LastSeq = 0 7056 fs.state.LastTime = time.Now().UTC() 7057 // Update msgs and bytes. 7058 fs.state.Msgs = 0 7059 fs.state.Bytes = 0 7060 7061 // Reset blocks. 7062 fs.blks, fs.lmb = nil, nil 7063 7064 // Reset subject mappings. 7065 fs.psim, fs.tsl = fs.psim.Empty(), 0 7066 fs.bim = make(map[uint32]*msgBlock) 7067 7068 // If we purged anything, make sure we kick flush state loop. 7069 if purged > 0 { 7070 fs.dirty++ 7071 } 7072 7073 fs.mu.Unlock() 7074 7075 if cb != nil { 7076 cb(-int64(purged), -int64(bytes), 0, _EMPTY_) 7077 } 7078 7079 return nil 7080 } 7081 7082 // Truncate will truncate a stream store up to seq. Sequence needs to be valid. 7083 func (fs *fileStore) Truncate(seq uint64) error { 7084 // Check for request to reset. 
7085 if seq == 0 { 7086 return fs.reset() 7087 } 7088 7089 fs.mu.Lock() 7090 7091 if fs.closed { 7092 fs.mu.Unlock() 7093 return ErrStoreClosed 7094 } 7095 if fs.sips > 0 { 7096 fs.mu.Unlock() 7097 return ErrStoreSnapshotInProgress 7098 } 7099 7100 nlmb := fs.selectMsgBlock(seq) 7101 if nlmb == nil { 7102 fs.mu.Unlock() 7103 return ErrInvalidSequence 7104 } 7105 lsm, _, _ := nlmb.fetchMsg(seq, nil) 7106 if lsm == nil { 7107 fs.mu.Unlock() 7108 return ErrInvalidSequence 7109 } 7110 7111 // Set lmb to nlmb and make sure writeable. 7112 fs.lmb = nlmb 7113 if err := nlmb.enableForWriting(fs.fip); err != nil { 7114 fs.mu.Unlock() 7115 return err 7116 } 7117 7118 var purged, bytes uint64 7119 7120 // Truncate our new last message block. 7121 nmsgs, nbytes, err := nlmb.truncate(lsm) 7122 if err != nil { 7123 fs.mu.Unlock() 7124 return fmt.Errorf("nlmb.truncate: %w", err) 7125 } 7126 // Account for the truncated msgs and bytes. 7127 purged += nmsgs 7128 bytes += nbytes 7129 7130 // Remove any left over msg blocks. 7131 getLastMsgBlock := func() *msgBlock { return fs.blks[len(fs.blks)-1] } 7132 for mb := getLastMsgBlock(); mb != nlmb; mb = getLastMsgBlock() { 7133 mb.mu.Lock() 7134 purged += mb.msgs 7135 bytes += mb.bytes 7136 fs.removeMsgBlock(mb) 7137 mb.mu.Unlock() 7138 } 7139 7140 // Reset last. 7141 fs.state.LastSeq = lsm.seq 7142 fs.state.LastTime = time.Unix(0, lsm.ts).UTC() 7143 // Update msgs and bytes. 7144 if purged > fs.state.Msgs { 7145 purged = fs.state.Msgs 7146 } 7147 fs.state.Msgs -= purged 7148 if bytes > fs.state.Bytes { 7149 bytes = fs.state.Bytes 7150 } 7151 fs.state.Bytes -= bytes 7152 7153 // Reset our subject lookup info. 
7154 fs.resetGlobalPerSubjectInfo() 7155 7156 fs.dirty++ 7157 7158 cb := fs.scb 7159 fs.mu.Unlock() 7160 7161 if cb != nil { 7162 cb(-int64(purged), -int64(bytes), 0, _EMPTY_) 7163 } 7164 7165 return nil 7166 } 7167 7168 func (fs *fileStore) lastSeq() uint64 { 7169 fs.mu.RLock() 7170 seq := fs.state.LastSeq 7171 fs.mu.RUnlock() 7172 return seq 7173 } 7174 7175 // Returns number of msg blks. 7176 func (fs *fileStore) numMsgBlocks() int { 7177 fs.mu.RLock() 7178 defer fs.mu.RUnlock() 7179 return len(fs.blks) 7180 } 7181 7182 // Will add a new msgBlock. 7183 // Lock should be held. 7184 func (fs *fileStore) addMsgBlock(mb *msgBlock) { 7185 fs.blks = append(fs.blks, mb) 7186 fs.lmb = mb 7187 fs.bim[mb.index] = mb 7188 } 7189 7190 // Remove from our list of blks. 7191 // Both locks should be held. 7192 func (fs *fileStore) removeMsgBlockFromList(mb *msgBlock) { 7193 // Remove from list. 7194 for i, omb := range fs.blks { 7195 if mb == omb { 7196 fs.dirty++ 7197 blks := append(fs.blks[:i], fs.blks[i+1:]...) 7198 fs.blks = copyMsgBlocks(blks) 7199 if fs.bim != nil { 7200 delete(fs.bim, mb.index) 7201 } 7202 break 7203 } 7204 } 7205 } 7206 7207 // Removes the msgBlock 7208 // Both locks should be held. 7209 func (fs *fileStore) removeMsgBlock(mb *msgBlock) { 7210 mb.dirtyCloseWithRemove(true) 7211 fs.removeMsgBlockFromList(mb) 7212 // Check for us being last message block 7213 if mb == fs.lmb { 7214 lseq, lts := atomic.LoadUint64(&mb.last.seq), mb.last.ts 7215 // Creating a new message write block requires that the lmb lock is not held. 7216 mb.mu.Unlock() 7217 // Write the tombstone to remember since this was last block. 7218 if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { 7219 lmb.writeTombstone(lseq, lts) 7220 } 7221 mb.mu.Lock() 7222 } 7223 } 7224 7225 // Called by purge to simply get rid of the cache and close our fds. 7226 // Lock should not be held. 
7227 func (mb *msgBlock) dirtyClose() { 7228 mb.mu.Lock() 7229 defer mb.mu.Unlock() 7230 mb.dirtyCloseWithRemove(false) 7231 } 7232 7233 // Should be called with lock held. 7234 func (mb *msgBlock) dirtyCloseWithRemove(remove bool) { 7235 if mb == nil { 7236 return 7237 } 7238 // Stop cache expiration timer. 7239 if mb.ctmr != nil { 7240 mb.ctmr.Stop() 7241 mb.ctmr = nil 7242 } 7243 // Clear any tracking by subject. 7244 mb.fss = nil 7245 // Close cache 7246 mb.clearCacheAndOffset() 7247 // Quit our loops. 7248 if mb.qch != nil { 7249 close(mb.qch) 7250 mb.qch = nil 7251 } 7252 if mb.mfd != nil { 7253 mb.mfd.Close() 7254 mb.mfd = nil 7255 } 7256 if remove { 7257 if mb.mfn != _EMPTY_ { 7258 os.Remove(mb.mfn) 7259 mb.mfn = _EMPTY_ 7260 } 7261 if mb.kfn != _EMPTY_ { 7262 os.Remove(mb.kfn) 7263 } 7264 } 7265 } 7266 7267 // Remove a seq from the fss and select new first. 7268 // Lock should be held. 7269 func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) { 7270 mb.ensurePerSubjectInfoLoaded() 7271 ss := mb.fss[subj] 7272 if ss == nil { 7273 return 7274 } 7275 7276 if ss.Msgs == 1 { 7277 delete(mb.fss, subj) 7278 return 7279 } 7280 7281 ss.Msgs-- 7282 7283 // Only one left. 7284 if ss.Msgs == 1 { 7285 if seq == ss.Last { 7286 ss.Last = ss.First 7287 } else { 7288 ss.First = ss.Last 7289 } 7290 ss.firstNeedsUpdate = false 7291 return 7292 } 7293 7294 // We can lazily calculate the first sequence when needed. 7295 ss.firstNeedsUpdate = seq == ss.First || ss.firstNeedsUpdate 7296 } 7297 7298 // Will recalulate the first sequence for this subject in this block. 7299 // Will avoid slower path message lookups and scan the cache directly instead. 7300 func (mb *msgBlock) recalculateFirstForSubj(subj string, startSeq uint64, ss *SimpleState) { 7301 // Need to make sure messages are loaded. 7302 if mb.cacheNotLoaded() { 7303 if err := mb.loadMsgsWithLock(); err != nil { 7304 return 7305 } 7306 } 7307 7308 // Mark first as updated. 
7309 ss.firstNeedsUpdate = false 7310 7311 startSlot := int(startSeq - mb.cache.fseq) 7312 if startSlot >= len(mb.cache.idx) { 7313 ss.First = ss.Last 7314 return 7315 } else if startSlot < 0 { 7316 startSlot = 0 7317 } 7318 7319 var le = binary.LittleEndian 7320 for slot, fseq := startSlot, atomic.LoadUint64(&mb.first.seq); slot < len(mb.cache.idx); slot++ { 7321 bi := mb.cache.idx[slot] &^ hbit 7322 if bi == dbit { 7323 // delete marker so skip. 7324 continue 7325 } 7326 li := int(bi) - mb.cache.off 7327 if li >= len(mb.cache.buf) { 7328 ss.First = ss.Last 7329 return 7330 } 7331 buf := mb.cache.buf[li:] 7332 hdr := buf[:msgHdrSize] 7333 slen := int(le.Uint16(hdr[20:])) 7334 if subj == bytesToString(buf[msgHdrSize:msgHdrSize+slen]) { 7335 seq := le.Uint64(hdr[4:]) 7336 if seq < fseq || seq&ebit != 0 || mb.dmap.Exists(seq) { 7337 continue 7338 } 7339 ss.First = seq 7340 return 7341 } 7342 } 7343 } 7344 7345 // Lock should be held. 7346 func (fs *fileStore) resetGlobalPerSubjectInfo() { 7347 // Clear any global subject state. 7348 fs.psim, fs.tsl = fs.psim.Empty(), 0 7349 for _, mb := range fs.blks { 7350 fs.populateGlobalPerSubjectInfo(mb) 7351 } 7352 } 7353 7354 // Lock should be held. 7355 func (mb *msgBlock) resetPerSubjectInfo() error { 7356 mb.fss = nil 7357 return mb.generatePerSubjectInfo() 7358 } 7359 7360 // generatePerSubjectInfo will generate the per subject info via the raw msg block. 7361 // Lock should be held. 7362 func (mb *msgBlock) generatePerSubjectInfo() error { 7363 // Check if this mb is empty. This can happen when its the last one and we are holding onto it for seq and timestamp info. 7364 if mb.msgs == 0 { 7365 return nil 7366 } 7367 7368 if mb.cacheNotLoaded() { 7369 if err := mb.loadMsgsWithLock(); err != nil { 7370 return err 7371 } 7372 // indexCacheBuf can produce fss now, so if non-nil we are good. 7373 if mb.fss != nil { 7374 return nil 7375 } 7376 } 7377 7378 // Create new one regardless. 
7379 mb.fss = make(map[string]*SimpleState) 7380 7381 var smv StoreMsg 7382 fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) 7383 for seq := fseq; seq <= lseq; seq++ { 7384 sm, err := mb.cacheLookup(seq, &smv) 7385 if err != nil { 7386 // Since we are walking by sequence we can ignore some errors that are benign to rebuilding our state. 7387 if err == ErrStoreMsgNotFound || err == errDeletedMsg { 7388 continue 7389 } 7390 if err == errNoCache { 7391 return nil 7392 } 7393 return err 7394 } 7395 if sm != nil && len(sm.subj) > 0 { 7396 if ss := mb.fss[sm.subj]; ss != nil { 7397 ss.Msgs++ 7398 ss.Last = seq 7399 } else { 7400 mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq} 7401 } 7402 } 7403 } 7404 7405 if len(mb.fss) > 0 { 7406 // Make sure we run the cache expire timer. 7407 mb.llts = time.Now().UnixNano() 7408 mb.startCacheExpireTimer() 7409 } 7410 return nil 7411 } 7412 7413 // Helper to make sure fss loaded if we are tracking. 7414 // Lock should be held 7415 func (mb *msgBlock) ensurePerSubjectInfoLoaded() error { 7416 if mb.fss != nil || mb.noTrack { 7417 return nil 7418 } 7419 if mb.msgs == 0 { 7420 mb.fss = make(map[string]*SimpleState) 7421 return nil 7422 } 7423 return mb.generatePerSubjectInfo() 7424 } 7425 7426 // Called on recovery to populate the global psim state. 7427 // Lock should be held. 7428 func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) { 7429 mb.mu.Lock() 7430 defer mb.mu.Unlock() 7431 7432 if err := mb.ensurePerSubjectInfoLoaded(); err != nil { 7433 return 7434 } 7435 7436 // Now populate psim. 
7437 for subj, ss := range mb.fss { 7438 if len(subj) > 0 { 7439 bsubj := stringToBytes(subj) 7440 if info, ok := fs.psim.Find(bsubj); ok { 7441 info.total += ss.Msgs 7442 if mb.index > info.lblk { 7443 info.lblk = mb.index 7444 } 7445 } else { 7446 fs.psim.Insert(bsubj, psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index}) 7447 fs.tsl += len(subj) 7448 } 7449 } 7450 } 7451 } 7452 7453 // Close the message block. 7454 func (mb *msgBlock) close(sync bool) { 7455 if mb == nil { 7456 return 7457 } 7458 mb.mu.Lock() 7459 defer mb.mu.Unlock() 7460 7461 if mb.closed { 7462 return 7463 } 7464 7465 // Stop cache expiration timer. 7466 if mb.ctmr != nil { 7467 mb.ctmr.Stop() 7468 mb.ctmr = nil 7469 } 7470 7471 // Clear fss. 7472 mb.fss = nil 7473 7474 // Close cache 7475 mb.clearCacheAndOffset() 7476 // Quit our loops. 7477 if mb.qch != nil { 7478 close(mb.qch) 7479 mb.qch = nil 7480 } 7481 if mb.mfd != nil { 7482 if sync { 7483 mb.mfd.Sync() 7484 } 7485 mb.mfd.Close() 7486 } 7487 mb.mfd = nil 7488 // Mark as closed. 7489 mb.closed = true 7490 } 7491 7492 func (fs *fileStore) closeAllMsgBlocks(sync bool) { 7493 for _, mb := range fs.blks { 7494 mb.close(sync) 7495 } 7496 } 7497 7498 func (fs *fileStore) Delete() error { 7499 if fs.isClosed() { 7500 // Always attempt to remove since we could have been closed beforehand. 7501 os.RemoveAll(fs.fcfg.StoreDir) 7502 // Since we did remove, if we did have anything remaining make sure to 7503 // call into any storage updates that had been registered. 7504 fs.mu.Lock() 7505 cb, msgs, bytes := fs.scb, int64(fs.state.Msgs), int64(fs.state.Bytes) 7506 // Guard against double accounting if called twice. 
7507 fs.state.Msgs, fs.state.Bytes = 0, 0 7508 fs.mu.Unlock() 7509 if msgs > 0 && cb != nil { 7510 cb(-msgs, -bytes, 0, _EMPTY_) 7511 } 7512 return ErrStoreClosed 7513 } 7514 7515 pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) 7516 // If purge directory still exists then we need to wait 7517 // in place and remove since rename would fail. 7518 if _, err := os.Stat(pdir); err == nil { 7519 os.RemoveAll(pdir) 7520 } 7521 7522 // Do Purge() since if we have lots of blocks uses a mv/rename. 7523 fs.Purge() 7524 7525 if err := fs.stop(false); err != nil { 7526 return err 7527 } 7528 7529 // Make sure we will not try to recover if killed before removal below completes. 7530 if err := os.Remove(filepath.Join(fs.fcfg.StoreDir, JetStreamMetaFile)); err != nil { 7531 return err 7532 } 7533 // Now move into different directory with "." prefix. 7534 ndir := filepath.Join(filepath.Dir(fs.fcfg.StoreDir), tsep+filepath.Base(fs.fcfg.StoreDir)) 7535 if err := os.Rename(fs.fcfg.StoreDir, ndir); err != nil { 7536 return err 7537 } 7538 // Do this in separate Go routine in case lots of blocks. 7539 // Purge above protects us as does the removal of meta artifacts above. 7540 go func() { 7541 err := os.RemoveAll(ndir) 7542 if err == nil { 7543 return 7544 } 7545 ttl := time.Now().Add(time.Second) 7546 for time.Now().Before(ttl) { 7547 time.Sleep(10 * time.Millisecond) 7548 if err = os.RemoveAll(ndir); err == nil { 7549 return 7550 } 7551 } 7552 }() 7553 7554 return nil 7555 } 7556 7557 // Lock should be held. 7558 func (fs *fileStore) setSyncTimer() { 7559 if fs.syncTmr != nil { 7560 fs.syncTmr.Reset(fs.fcfg.SyncInterval) 7561 } else { 7562 fs.syncTmr = time.AfterFunc(fs.fcfg.SyncInterval, fs.syncBlocks) 7563 } 7564 } 7565 7566 // Lock should be held. 
7567 func (fs *fileStore) cancelSyncTimer() { 7568 if fs.syncTmr != nil { 7569 fs.syncTmr.Stop() 7570 fs.syncTmr = nil 7571 } 7572 } 7573 7574 const ( 7575 fullStateMagic = uint8(11) 7576 fullStateVersion = uint8(1) 7577 ) 7578 7579 // This go routine runs and receives kicks to write out our full stream state index. 7580 // This will get kicked when we create a new block or when we delete a block in general. 7581 // This is also called during Stop(). 7582 func (fs *fileStore) flushStreamStateLoop(qch, done chan struct{}) { 7583 // Signal we are done on exit. 7584 defer close(done) 7585 7586 // Make sure we do not try to write these out too fast. 7587 const writeThreshold = 2 * time.Minute 7588 t := time.NewTicker(writeThreshold) 7589 defer t.Stop() 7590 7591 for { 7592 select { 7593 case <-t.C: 7594 fs.writeFullState() 7595 case <-qch: 7596 return 7597 } 7598 } 7599 } 7600 7601 // Helper since unixnano of zero time undefined. 7602 func timestampNormalized(t time.Time) int64 { 7603 if t.IsZero() { 7604 return 0 7605 } 7606 return t.UnixNano() 7607 } 7608 7609 // writeFullState will proceed to write the full meta state iff not complex and time consuming. 7610 // Since this is for quick recovery it is optional and should not block/stall normal operations. 7611 func (fs *fileStore) writeFullState() error { 7612 return fs._writeFullState(false) 7613 } 7614 7615 // forceWriteFullState will proceed to write the full meta state. This should only be called by stop() 7616 func (fs *fileStore) forceWriteFullState() error { 7617 return fs._writeFullState(true) 7618 } 7619 7620 // This will write the full binary state for the stream. 7621 // This plus everything new since last hash will be the total recovered state. 7622 // This state dump will have the following. 7623 // 1. Stream summary - Msgs, Bytes, First and Last (Sequence and Timestamp) 7624 // 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present. 7625 // 3. 
MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset). 7626 // 4. Last block index and hash of record inclusive to this stream state. 7627 func (fs *fileStore) _writeFullState(force bool) error { 7628 fs.mu.Lock() 7629 if fs.closed || fs.dirty == 0 { 7630 fs.mu.Unlock() 7631 return nil 7632 } 7633 7634 // For calculating size and checking time costs for non forced calls. 7635 numSubjects := fs.numSubjects() 7636 7637 // If we are not being forced to write out our state, check the complexity for time costs as to not 7638 // block or stall normal operations. 7639 // We will base off of number of subjects and interior deletes. A very large number of msg blocks could also 7640 // be used, but for next server version will redo all meta handling to be disk based. So this is temporary. 7641 if !force { 7642 const numThreshold = 1_000_000 7643 // Calculate interior deletes. 7644 var numDeleted int 7645 if fs.state.LastSeq > fs.state.FirstSeq { 7646 numDeleted = int((fs.state.LastSeq - fs.state.FirstSeq + 1) - fs.state.Msgs) 7647 } 7648 if numSubjects > numThreshold || numDeleted > numThreshold { 7649 fs.mu.Unlock() 7650 return errStateTooBig 7651 } 7652 } 7653 7654 // We track this through subsequent runs to get an avg per blk used for subsequent runs. 7655 avgDmapLen := fs.adml 7656 // If first time through could be 0 7657 if avgDmapLen == 0 && ((fs.state.LastSeq-fs.state.FirstSeq+1)-fs.state.Msgs) > 0 { 7658 avgDmapLen = 1024 7659 } 7660 7661 // Calculate and estimate of the uper bound on the size to avoid multiple allocations. 7662 sz := hdrLen + // Magic and Version 7663 (binary.MaxVarintLen64 * 6) + // FS data 7664 binary.MaxVarintLen64 + fs.tsl + // NumSubjects + total subject length 7665 numSubjects*(binary.MaxVarintLen64*4) + // psi record 7666 binary.MaxVarintLen64 + // Num blocks. 
7667 len(fs.blks)*((binary.MaxVarintLen64*7)+avgDmapLen) + // msg blocks, avgDmapLen is est for dmaps 7668 binary.MaxVarintLen64 + 8 + 8 // last index + record checksum + full state checksum 7669 7670 // Do 4k on stack if possible. 7671 const ssz = 4 * 1024 7672 var buf []byte 7673 7674 if sz <= ssz { 7675 var _buf [ssz]byte 7676 buf, sz = _buf[0:hdrLen:ssz], ssz 7677 } else { 7678 buf = make([]byte, hdrLen, sz) 7679 } 7680 7681 buf[0], buf[1] = fullStateMagic, fullStateVersion 7682 buf = binary.AppendUvarint(buf, fs.state.Msgs) 7683 buf = binary.AppendUvarint(buf, fs.state.Bytes) 7684 buf = binary.AppendUvarint(buf, fs.state.FirstSeq) 7685 buf = binary.AppendVarint(buf, timestampNormalized(fs.state.FirstTime)) 7686 buf = binary.AppendUvarint(buf, fs.state.LastSeq) 7687 buf = binary.AppendVarint(buf, timestampNormalized(fs.state.LastTime)) 7688 7689 // Do per subject information map if applicable. 7690 buf = binary.AppendUvarint(buf, uint64(numSubjects)) 7691 if numSubjects > 0 { 7692 fs.psim.Match([]byte(fwcs), func(subj []byte, psi *psi) { 7693 buf = binary.AppendUvarint(buf, uint64(len(subj))) 7694 buf = append(buf, subj...) 7695 buf = binary.AppendUvarint(buf, psi.total) 7696 buf = binary.AppendUvarint(buf, uint64(psi.fblk)) 7697 if psi.total > 1 { 7698 buf = binary.AppendUvarint(buf, uint64(psi.lblk)) 7699 } 7700 }) 7701 } 7702 7703 // Now walk all blocks and write out first and last and optional dmap encoding. 7704 var lbi uint32 7705 var lchk [8]byte 7706 7707 nb := len(fs.blks) 7708 buf = binary.AppendUvarint(buf, uint64(nb)) 7709 7710 // Use basetime to save some space. 7711 baseTime := timestampNormalized(fs.state.FirstTime) 7712 var scratch [8 * 1024]byte 7713 7714 // Track the state as represented by the mbs. 
7715 var mstate StreamState 7716 7717 var dmapTotalLen int 7718 for _, mb := range fs.blks { 7719 mb.mu.RLock() 7720 buf = binary.AppendUvarint(buf, uint64(mb.index)) 7721 buf = binary.AppendUvarint(buf, mb.bytes) 7722 buf = binary.AppendUvarint(buf, atomic.LoadUint64(&mb.first.seq)) 7723 buf = binary.AppendVarint(buf, mb.first.ts-baseTime) 7724 buf = binary.AppendUvarint(buf, atomic.LoadUint64(&mb.last.seq)) 7725 buf = binary.AppendVarint(buf, mb.last.ts-baseTime) 7726 7727 numDeleted := mb.dmap.Size() 7728 buf = binary.AppendUvarint(buf, uint64(numDeleted)) 7729 if numDeleted > 0 { 7730 dmap, _ := mb.dmap.Encode(scratch[:0]) 7731 dmapTotalLen += len(dmap) 7732 buf = append(buf, dmap...) 7733 } 7734 // If this is the last one grab the last checksum and the block index, e.g. 22.blk, 22 is the block index. 7735 // We use this to quickly open this file on recovery. 7736 if mb == fs.lmb { 7737 lbi = mb.index 7738 mb.ensureLastChecksumLoaded() 7739 copy(lchk[0:], mb.lchk[:]) 7740 } 7741 updateTrackingState(&mstate, mb) 7742 mb.mu.RUnlock() 7743 } 7744 if dmapTotalLen > 0 { 7745 fs.adml = dmapTotalLen / len(fs.blks) 7746 } 7747 7748 // Place block index and hash onto the end. 7749 buf = binary.AppendUvarint(buf, uint64(lbi)) 7750 buf = append(buf, lchk[:]...) 7751 7752 // Encrypt if needed. 7753 if fs.prf != nil { 7754 if err := fs.setupAEK(); err != nil { 7755 fs.mu.Unlock() 7756 return err 7757 } 7758 nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead()) 7759 if n, err := rand.Read(nonce); err != nil { 7760 return err 7761 } else if n != len(nonce) { 7762 return fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce)) 7763 } 7764 buf = fs.aek.Seal(nonce, nonce, buf, nil) 7765 } 7766 7767 fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) 7768 7769 fs.hh.Reset() 7770 fs.hh.Write(buf) 7771 buf = fs.hh.Sum(buf) 7772 7773 // Snapshot prior dirty count. 
7774 priorDirty := fs.dirty 7775 7776 statesEqual := trackingStatesEqual(&fs.state, &mstate) || len(fs.blks) > 0 7777 // Release lock. 7778 fs.mu.Unlock() 7779 7780 // Check consistency here. 7781 if !statesEqual { 7782 fs.warn("Stream state encountered internal inconsistency on write") 7783 // Rebuild our fs state from the mb state. 7784 fs.rebuildState(nil) 7785 return errCorruptState 7786 } 7787 7788 if cap(buf) > sz { 7789 fs.debug("WriteFullState reallocated from %d to %d", sz, cap(buf)) 7790 } 7791 7792 // Write to a tmp file and rename. 7793 const tmpPre = streamStreamStateFile + tsep 7794 f, err := os.CreateTemp(filepath.Join(fs.fcfg.StoreDir, msgDir), tmpPre) 7795 if err != nil { 7796 return err 7797 } 7798 tmpName := f.Name() 7799 defer os.Remove(tmpName) 7800 if _, err = f.Write(buf); err == nil && fs.fcfg.SyncAlways { 7801 f.Sync() 7802 } 7803 f.Close() 7804 if err != nil { 7805 return err 7806 } 7807 7808 // Rename into position under our lock, clear prior dirty pending on success. 7809 fs.mu.Lock() 7810 if !fs.closed { 7811 if err := os.Rename(tmpName, fn); err != nil { 7812 fs.mu.Unlock() 7813 return err 7814 } 7815 fs.dirty -= priorDirty 7816 } 7817 fs.mu.Unlock() 7818 7819 return nil 7820 } 7821 7822 // Stop the current filestore. 7823 func (fs *fileStore) Stop() error { 7824 return fs.stop(true) 7825 } 7826 7827 // Stop the current filestore. 7828 func (fs *fileStore) stop(writeState bool) error { 7829 fs.mu.Lock() 7830 if fs.closed || fs.closing { 7831 fs.mu.Unlock() 7832 return ErrStoreClosed 7833 } 7834 7835 // Mark as closing. Do before releasing the lock to writeFullState 7836 // so we don't end up with this function running more than once. 7837 fs.closing = true 7838 7839 if writeState { 7840 fs.checkAndFlushAllBlocks() 7841 } 7842 fs.closeAllMsgBlocks(false) 7843 7844 fs.cancelSyncTimer() 7845 fs.cancelAgeChk() 7846 7847 // Release the state flusher loop. 
7848 if fs.qch != nil { 7849 close(fs.qch) 7850 fs.qch = nil 7851 } 7852 7853 if writeState { 7854 // Wait for the state flush loop to exit. 7855 fsld := fs.fsld 7856 fs.mu.Unlock() 7857 <-fsld 7858 // Write full state if needed. If not dirty this is a no-op. 7859 fs.forceWriteFullState() 7860 fs.mu.Lock() 7861 } 7862 7863 // Mark as closed. Last message block needs to be cleared after 7864 // writeFullState has completed. 7865 fs.closed = true 7866 fs.lmb = nil 7867 7868 // We should update the upper usage layer on a stop. 7869 cb, bytes := fs.scb, int64(fs.state.Bytes) 7870 fs.mu.Unlock() 7871 7872 fs.cmu.Lock() 7873 var _cfs [256]ConsumerStore 7874 cfs := append(_cfs[:0], fs.cfs...) 7875 fs.cfs = nil 7876 fs.cmu.Unlock() 7877 7878 for _, o := range cfs { 7879 o.Stop() 7880 } 7881 7882 if bytes > 0 && cb != nil { 7883 cb(0, -bytes, 0, _EMPTY_) 7884 } 7885 7886 return nil 7887 } 7888 7889 const errFile = "errors.txt" 7890 7891 // Stream our snapshot through S2 compression and tar. 7892 func (fs *fileStore) streamSnapshot(w io.WriteCloser, includeConsumers bool) { 7893 defer w.Close() 7894 7895 enc := s2.NewWriter(w) 7896 defer enc.Close() 7897 7898 tw := tar.NewWriter(enc) 7899 defer tw.Close() 7900 7901 defer func() { 7902 fs.mu.Lock() 7903 fs.sips-- 7904 fs.mu.Unlock() 7905 }() 7906 7907 modTime := time.Now().UTC() 7908 7909 writeFile := func(name string, buf []byte) error { 7910 hdr := &tar.Header{ 7911 Name: name, 7912 Mode: 0600, 7913 ModTime: modTime, 7914 Uname: "nats", 7915 Gname: "nats", 7916 Size: int64(len(buf)), 7917 Format: tar.FormatPAX, 7918 } 7919 if err := tw.WriteHeader(hdr); err != nil { 7920 return err 7921 } 7922 if _, err := tw.Write(buf); err != nil { 7923 return err 7924 } 7925 return nil 7926 } 7927 7928 writeErr := func(err string) { 7929 writeFile(errFile, []byte(err)) 7930 } 7931 7932 fs.mu.Lock() 7933 blks := fs.blks 7934 // Grab our general meta data. 7935 // We do this now instead of pulling from files since they could be encrypted. 
7936 meta, err := json.Marshal(fs.cfg) 7937 if err != nil { 7938 fs.mu.Unlock() 7939 writeErr(fmt.Sprintf("Could not gather stream meta file: %v", err)) 7940 return 7941 } 7942 hh := fs.hh 7943 hh.Reset() 7944 hh.Write(meta) 7945 sum := []byte(hex.EncodeToString(fs.hh.Sum(nil))) 7946 fs.mu.Unlock() 7947 7948 // Meta first. 7949 if writeFile(JetStreamMetaFile, meta) != nil { 7950 return 7951 } 7952 if writeFile(JetStreamMetaFileSum, sum) != nil { 7953 return 7954 } 7955 7956 // Can't use join path here, tar only recognizes relative paths with forward slashes. 7957 msgPre := msgDir + "/" 7958 var bbuf []byte 7959 7960 const minLen = 32 7961 sfn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) 7962 if buf, err := os.ReadFile(sfn); err == nil && len(buf) >= minLen { 7963 if fs.aek != nil { 7964 ns := fs.aek.NonceSize() 7965 buf, err = fs.aek.Open(nil, buf[:ns], buf[ns:len(buf)-highwayhash.Size64], nil) 7966 if err == nil { 7967 // Redo hash checksum at end on plaintext. 7968 fs.mu.Lock() 7969 hh.Reset() 7970 hh.Write(buf) 7971 buf = fs.hh.Sum(buf) 7972 fs.mu.Unlock() 7973 } 7974 } 7975 if err == nil && writeFile(msgPre+streamStreamStateFile, buf) != nil { 7976 return 7977 } 7978 } 7979 7980 // Now do messages themselves. 7981 for _, mb := range blks { 7982 if mb.pendingWriteSize() > 0 { 7983 mb.flushPendingMsgs() 7984 } 7985 mb.mu.Lock() 7986 // We could stream but don't want to hold the lock and prevent changes, so just read in and 7987 // release the lock for now. 7988 bbuf, err = mb.loadBlock(bbuf) 7989 if err != nil { 7990 mb.mu.Unlock() 7991 writeErr(fmt.Sprintf("Could not read message block [%d]: %v", mb.index, err)) 7992 return 7993 } 7994 // Check for encryption. 
7995 if mb.bek != nil && len(bbuf) > 0 { 7996 rbek, err := genBlockEncryptionKey(fs.fcfg.Cipher, mb.seed, mb.nonce) 7997 if err != nil { 7998 mb.mu.Unlock() 7999 writeErr(fmt.Sprintf("Could not create encryption key for message block [%d]: %v", mb.index, err)) 8000 return 8001 } 8002 rbek.XORKeyStream(bbuf, bbuf) 8003 } 8004 // Check for compression. 8005 if bbuf, err = mb.decompressIfNeeded(bbuf); err != nil { 8006 mb.mu.Unlock() 8007 writeErr(fmt.Sprintf("Could not decompress message block [%d]: %v", mb.index, err)) 8008 return 8009 } 8010 mb.mu.Unlock() 8011 8012 // Do this one unlocked. 8013 if writeFile(msgPre+fmt.Sprintf(blkScan, mb.index), bbuf) != nil { 8014 return 8015 } 8016 } 8017 8018 // Bail if no consumers requested. 8019 if !includeConsumers { 8020 return 8021 } 8022 8023 // Do consumers' state last. 8024 fs.cmu.RLock() 8025 cfs := fs.cfs 8026 fs.cmu.RUnlock() 8027 8028 for _, cs := range cfs { 8029 o, ok := cs.(*consumerFileStore) 8030 if !ok { 8031 continue 8032 } 8033 o.mu.Lock() 8034 // Grab our general meta data. 8035 // We do this now instead of pulling from files since they could be encrypted. 8036 meta, err := json.Marshal(o.cfg) 8037 if err != nil { 8038 o.mu.Unlock() 8039 writeErr(fmt.Sprintf("Could not gather consumer meta file for %q: %v", o.name, err)) 8040 return 8041 } 8042 o.hh.Reset() 8043 o.hh.Write(meta) 8044 sum := []byte(hex.EncodeToString(o.hh.Sum(nil))) 8045 8046 // We can have the running state directly encoded now. 8047 state, err := o.encodeState() 8048 if err != nil { 8049 o.mu.Unlock() 8050 writeErr(fmt.Sprintf("Could not encode consumer state for %q: %v", o.name, err)) 8051 return 8052 } 8053 odirPre := filepath.Join(consumerDir, o.name) 8054 o.mu.Unlock() 8055 8056 // Write all the consumer files. 
8057 if writeFile(filepath.Join(odirPre, JetStreamMetaFile), meta) != nil { 8058 return 8059 } 8060 if writeFile(filepath.Join(odirPre, JetStreamMetaFileSum), sum) != nil { 8061 return 8062 } 8063 writeFile(filepath.Join(odirPre, consumerState), state) 8064 } 8065 } 8066 8067 // Create a snapshot of this stream and its consumer's state along with messages. 8068 func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumers bool) (*SnapshotResult, error) { 8069 fs.mu.Lock() 8070 if fs.closed { 8071 fs.mu.Unlock() 8072 return nil, ErrStoreClosed 8073 } 8074 // Only allow one at a time. 8075 if fs.sips > 0 { 8076 fs.mu.Unlock() 8077 return nil, ErrStoreSnapshotInProgress 8078 } 8079 // Mark us as snapshotting 8080 fs.sips += 1 8081 fs.mu.Unlock() 8082 8083 if checkMsgs { 8084 ld := fs.checkMsgs() 8085 if ld != nil && len(ld.Msgs) > 0 { 8086 return nil, fmt.Errorf("snapshot check detected %d bad messages", len(ld.Msgs)) 8087 } 8088 } 8089 8090 // Write out full state as well before proceeding. 8091 fs.writeFullState() 8092 8093 pr, pw := net.Pipe() 8094 8095 // Set a write deadline here to protect ourselves. 8096 if deadline > 0 { 8097 pw.SetWriteDeadline(time.Now().Add(deadline)) 8098 } 8099 8100 // We can add to our stream while snapshotting but not "user" delete anything. 8101 var state StreamState 8102 fs.FastState(&state) 8103 8104 // Stream in separate Go routine. 8105 go fs.streamSnapshot(pw, includeConsumers) 8106 8107 return &SnapshotResult{pr, state}, nil 8108 } 8109 8110 // Helper to return the config. 8111 func (fs *fileStore) fileStoreConfig() FileStoreConfig { 8112 fs.mu.RLock() 8113 defer fs.mu.RUnlock() 8114 return fs.fcfg 8115 } 8116 8117 // Read lock all existing message blocks. 8118 // Lock held on entry. 8119 func (fs *fileStore) readLockAllMsgBlocks() { 8120 for _, mb := range fs.blks { 8121 mb.mu.RLock() 8122 } 8123 } 8124 8125 // Read unlock all existing message blocks. 8126 // Lock held on entry. 
8127 func (fs *fileStore) readUnlockAllMsgBlocks() { 8128 for _, mb := range fs.blks { 8129 mb.mu.RUnlock() 8130 } 8131 } 8132 8133 // Binary encoded state snapshot, >= v2.10 server. 8134 func (fs *fileStore) EncodedStreamState(failed uint64) ([]byte, error) { 8135 fs.mu.RLock() 8136 defer fs.mu.RUnlock() 8137 8138 // Calculate deleted. 8139 var numDeleted int64 8140 if fs.state.LastSeq > fs.state.FirstSeq { 8141 numDeleted = int64(fs.state.LastSeq-fs.state.FirstSeq+1) - int64(fs.state.Msgs) 8142 if numDeleted < 0 { 8143 numDeleted = 0 8144 } 8145 } 8146 8147 // Encoded is Msgs, Bytes, FirstSeq, LastSeq, Failed, NumDeleted and optional DeletedBlocks 8148 var buf [1024]byte 8149 buf[0], buf[1] = streamStateMagic, streamStateVersion 8150 n := hdrLen 8151 n += binary.PutUvarint(buf[n:], fs.state.Msgs) 8152 n += binary.PutUvarint(buf[n:], fs.state.Bytes) 8153 n += binary.PutUvarint(buf[n:], fs.state.FirstSeq) 8154 n += binary.PutUvarint(buf[n:], fs.state.LastSeq) 8155 n += binary.PutUvarint(buf[n:], failed) 8156 n += binary.PutUvarint(buf[n:], uint64(numDeleted)) 8157 8158 b := buf[0:n] 8159 8160 if numDeleted > 0 { 8161 var scratch [4 * 1024]byte 8162 8163 fs.readLockAllMsgBlocks() 8164 defer fs.readUnlockAllMsgBlocks() 8165 8166 for _, db := range fs.deleteBlocks() { 8167 switch db := db.(type) { 8168 case *DeleteRange: 8169 first, _, num := db.State() 8170 scratch[0] = runLengthMagic 8171 i := 1 8172 i += binary.PutUvarint(scratch[i:], first) 8173 i += binary.PutUvarint(scratch[i:], num) 8174 b = append(b, scratch[0:i]...) 8175 case *avl.SequenceSet: 8176 buf, err := db.Encode(scratch[:0]) 8177 if err != nil { 8178 return nil, err 8179 } 8180 b = append(b, buf...) 8181 default: 8182 return nil, errors.New("no impl") 8183 } 8184 } 8185 } 8186 8187 return b, nil 8188 } 8189 8190 // We used to be more sophisticated to save memory, but speed is more important. 8191 // All blocks should be at least read locked. 
8192 func (fs *fileStore) deleteBlocks() DeleteBlocks { 8193 var dbs DeleteBlocks 8194 var prevLast uint64 8195 8196 for _, mb := range fs.blks { 8197 // Detect if we have a gap between these blocks. 8198 fseq := atomic.LoadUint64(&mb.first.seq) 8199 if prevLast > 0 && prevLast+1 != fseq { 8200 dbs = append(dbs, &DeleteRange{First: prevLast + 1, Num: fseq - prevLast - 1}) 8201 } 8202 if mb.dmap.Size() > 0 { 8203 dbs = append(dbs, &mb.dmap) 8204 } 8205 prevLast = atomic.LoadUint64(&mb.last.seq) 8206 } 8207 return dbs 8208 } 8209 8210 // SyncDeleted will make sure this stream has same deleted state as dbs. 8211 func (fs *fileStore) SyncDeleted(dbs DeleteBlocks) { 8212 if len(dbs) == 0 { 8213 return 8214 } 8215 8216 fs.mu.Lock() 8217 defer fs.mu.Unlock() 8218 8219 var needsCheck DeleteBlocks 8220 8221 fs.readLockAllMsgBlocks() 8222 mdbs := fs.deleteBlocks() 8223 for i, db := range dbs { 8224 // If the block is same as what we have we can skip. 8225 if i < len(mdbs) { 8226 first, last, num := db.State() 8227 eFirst, eLast, eNum := mdbs[i].State() 8228 if first == eFirst && last == eLast && num == eNum { 8229 continue 8230 } 8231 } 8232 // Need to insert these. 
8233 needsCheck = append(needsCheck, db) 8234 } 8235 fs.readUnlockAllMsgBlocks() 8236 8237 for _, db := range needsCheck { 8238 db.Range(func(dseq uint64) bool { 8239 fs.removeMsg(dseq, false, true, false) 8240 return true 8241 }) 8242 } 8243 } 8244 8245 //////////////////////////////////////////////////////////////////////////////// 8246 // Consumers 8247 //////////////////////////////////////////////////////////////////////////////// 8248 8249 type consumerFileStore struct { 8250 mu sync.Mutex 8251 fs *fileStore 8252 cfg *FileConsumerInfo 8253 prf keyGen 8254 aek cipher.AEAD 8255 name string 8256 odir string 8257 ifn string 8258 hh hash.Hash64 8259 state ConsumerState 8260 fch chan struct{} 8261 qch chan struct{} 8262 flusher bool 8263 writing bool 8264 dirty bool 8265 closed bool 8266 } 8267 8268 func (fs *fileStore) ConsumerStore(name string, cfg *ConsumerConfig) (ConsumerStore, error) { 8269 if fs == nil { 8270 return nil, fmt.Errorf("filestore is nil") 8271 } 8272 if fs.isClosed() { 8273 return nil, ErrStoreClosed 8274 } 8275 if cfg == nil || name == _EMPTY_ { 8276 return nil, fmt.Errorf("bad consumer config") 8277 } 8278 8279 // We now allow overrides from a stream being a filestore type and forcing a consumer to be memory store. 8280 if cfg.MemoryStorage { 8281 // Create directly here. 
8282 o := &consumerMemStore{ms: fs, cfg: *cfg} 8283 fs.AddConsumer(o) 8284 return o, nil 8285 } 8286 8287 odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, name) 8288 if err := os.MkdirAll(odir, defaultDirPerms); err != nil { 8289 return nil, fmt.Errorf("could not create consumer directory - %v", err) 8290 } 8291 csi := &FileConsumerInfo{Name: name, Created: time.Now().UTC(), ConsumerConfig: *cfg} 8292 o := &consumerFileStore{ 8293 fs: fs, 8294 cfg: csi, 8295 prf: fs.prf, 8296 name: name, 8297 odir: odir, 8298 ifn: filepath.Join(odir, consumerState), 8299 } 8300 key := sha256.Sum256([]byte(fs.cfg.Name + "/" + name)) 8301 hh, err := highwayhash.New64(key[:]) 8302 if err != nil { 8303 return nil, fmt.Errorf("could not create hash: %v", err) 8304 } 8305 o.hh = hh 8306 8307 // Check for encryption. 8308 if o.prf != nil { 8309 if ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey)); err == nil { 8310 if len(ekey) < minBlkKeySize { 8311 return nil, errBadKeySize 8312 } 8313 // Recover key encryption key. 8314 rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name)) 8315 if err != nil { 8316 return nil, err 8317 } 8318 8319 sc := fs.fcfg.Cipher 8320 kek, err := genEncryptionKey(sc, rb) 8321 if err != nil { 8322 return nil, err 8323 } 8324 ns := kek.NonceSize() 8325 nonce := ekey[:ns] 8326 seed, err := kek.Open(nil, nonce, ekey[ns:], nil) 8327 if err != nil { 8328 // We may be here on a cipher conversion, so attempt to convert. 8329 if err = o.convertCipher(); err != nil { 8330 return nil, err 8331 } 8332 } else { 8333 o.aek, err = genEncryptionKey(sc, seed) 8334 } 8335 if err != nil { 8336 return nil, err 8337 } 8338 } 8339 } 8340 8341 // Track if we are creating the directory so that we can clean up if we encounter an error. 8342 var didCreate bool 8343 8344 // Write our meta data iff does not exist. 
8345 meta := filepath.Join(odir, JetStreamMetaFile) 8346 if _, err := os.Stat(meta); err != nil && os.IsNotExist(err) { 8347 didCreate = true 8348 csi.Created = time.Now().UTC() 8349 if err := o.writeConsumerMeta(); err != nil { 8350 os.RemoveAll(odir) 8351 return nil, err 8352 } 8353 } 8354 8355 // If we expect to be encrypted check that what we are restoring is not plaintext. 8356 // This can happen on snapshot restores or conversions. 8357 if o.prf != nil { 8358 keyFile := filepath.Join(odir, JetStreamMetaFileKey) 8359 if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) { 8360 if err := o.writeConsumerMeta(); err != nil { 8361 if didCreate { 8362 os.RemoveAll(odir) 8363 } 8364 return nil, err 8365 } 8366 // Redo the state file as well here if we have one and we can tell it was plaintext. 8367 if buf, err := os.ReadFile(o.ifn); err == nil { 8368 if _, err := decodeConsumerState(buf); err == nil { 8369 state, err := o.encryptState(buf) 8370 if err != nil { 8371 return nil, err 8372 } 8373 <-dios 8374 err = os.WriteFile(o.ifn, state, defaultFilePerms) 8375 dios <- struct{}{} 8376 if err != nil { 8377 if didCreate { 8378 os.RemoveAll(odir) 8379 } 8380 return nil, err 8381 } 8382 } 8383 } 8384 } 8385 } 8386 8387 // Create channels to control our flush go routine. 8388 o.fch = make(chan struct{}, 1) 8389 o.qch = make(chan struct{}) 8390 go o.flushLoop(o.fch, o.qch) 8391 8392 // Make sure to load in our state from disk if needed. 8393 o.loadState() 8394 8395 // Assign to filestore. 8396 fs.AddConsumer(o) 8397 8398 return o, nil 8399 } 8400 8401 func (o *consumerFileStore) convertCipher() error { 8402 fs := o.fs 8403 odir := filepath.Join(fs.fcfg.StoreDir, consumerDir, o.name) 8404 8405 ekey, err := os.ReadFile(filepath.Join(odir, JetStreamMetaFileKey)) 8406 if err != nil { 8407 return err 8408 } 8409 if len(ekey) < minBlkKeySize { 8410 return errBadKeySize 8411 } 8412 // Recover key encryption key. 
8413 rb, err := fs.prf([]byte(fs.cfg.Name + tsep + o.name)) 8414 if err != nil { 8415 return err 8416 } 8417 8418 // Do these in reverse since converting. 8419 sc := fs.fcfg.Cipher 8420 osc := AES 8421 if sc == AES { 8422 osc = ChaCha 8423 } 8424 kek, err := genEncryptionKey(osc, rb) 8425 if err != nil { 8426 return err 8427 } 8428 ns := kek.NonceSize() 8429 nonce := ekey[:ns] 8430 seed, err := kek.Open(nil, nonce, ekey[ns:], nil) 8431 if err != nil { 8432 return err 8433 } 8434 aek, err := genEncryptionKey(osc, seed) 8435 if err != nil { 8436 return err 8437 } 8438 // Now read in and decode our state using the old cipher. 8439 buf, err := os.ReadFile(o.ifn) 8440 if err != nil { 8441 return err 8442 } 8443 buf, err = aek.Open(nil, buf[:ns], buf[ns:], nil) 8444 if err != nil { 8445 return err 8446 } 8447 8448 // Since we are here we recovered our old state. 8449 // Now write our meta, which will generate the new keys with the new cipher. 8450 if err := o.writeConsumerMeta(); err != nil { 8451 return err 8452 } 8453 8454 // Now write out or state with the new cipher. 8455 return o.writeState(buf) 8456 } 8457 8458 // Kick flusher for this consumer. 8459 // Lock should be held. 8460 func (o *consumerFileStore) kickFlusher() { 8461 if o.fch != nil { 8462 select { 8463 case o.fch <- struct{}{}: 8464 default: 8465 } 8466 } 8467 o.dirty = true 8468 } 8469 8470 // Set in flusher status 8471 func (o *consumerFileStore) setInFlusher() { 8472 o.mu.Lock() 8473 o.flusher = true 8474 o.mu.Unlock() 8475 } 8476 8477 // Clear in flusher status 8478 func (o *consumerFileStore) clearInFlusher() { 8479 o.mu.Lock() 8480 o.flusher = false 8481 o.mu.Unlock() 8482 } 8483 8484 // Report in flusher status 8485 func (o *consumerFileStore) inFlusher() bool { 8486 o.mu.Lock() 8487 defer o.mu.Unlock() 8488 return o.flusher 8489 } 8490 8491 // flushLoop watches for consumer updates and the quit channel. 
8492 func (o *consumerFileStore) flushLoop(fch, qch chan struct{}) { 8493 8494 o.setInFlusher() 8495 defer o.clearInFlusher() 8496 8497 // Maintain approximately 10 updates per second per consumer under load. 8498 const minTime = 100 * time.Millisecond 8499 var lastWrite time.Time 8500 var dt *time.Timer 8501 8502 setDelayTimer := func(addWait time.Duration) { 8503 if dt == nil { 8504 dt = time.NewTimer(addWait) 8505 return 8506 } 8507 if !dt.Stop() { 8508 select { 8509 case <-dt.C: 8510 default: 8511 } 8512 } 8513 dt.Reset(addWait) 8514 } 8515 8516 for { 8517 select { 8518 case <-fch: 8519 if ts := time.Since(lastWrite); ts < minTime { 8520 setDelayTimer(minTime - ts) 8521 select { 8522 case <-dt.C: 8523 case <-qch: 8524 return 8525 } 8526 } 8527 o.mu.Lock() 8528 if o.closed { 8529 o.mu.Unlock() 8530 return 8531 } 8532 buf, err := o.encodeState() 8533 o.mu.Unlock() 8534 if err != nil { 8535 return 8536 } 8537 // TODO(dlc) - if we error should start failing upwards. 8538 if err := o.writeState(buf); err == nil { 8539 lastWrite = time.Now() 8540 } 8541 case <-qch: 8542 return 8543 } 8544 } 8545 } 8546 8547 // SetStarting sets our starting stream sequence. 8548 func (o *consumerFileStore) SetStarting(sseq uint64) error { 8549 o.mu.Lock() 8550 o.state.Delivered.Stream = sseq 8551 buf, err := o.encodeState() 8552 o.mu.Unlock() 8553 if err != nil { 8554 return err 8555 } 8556 return o.writeState(buf) 8557 } 8558 8559 // HasState returns if this store has a recorded state. 8560 func (o *consumerFileStore) HasState() bool { 8561 o.mu.Lock() 8562 _, err := os.Stat(o.ifn) 8563 o.mu.Unlock() 8564 return err == nil 8565 } 8566 8567 // UpdateDelivered is called whenever a new message has been delivered. 
8568 func (o *consumerFileStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) error { 8569 o.mu.Lock() 8570 defer o.mu.Unlock() 8571 8572 if dc != 1 && o.cfg.AckPolicy == AckNone { 8573 return ErrNoAckPolicy 8574 } 8575 8576 // On restarts the old leader may get a replay from the raft logs that are old. 8577 if dseq <= o.state.AckFloor.Consumer { 8578 return nil 8579 } 8580 8581 // See if we expect an ack for this. 8582 if o.cfg.AckPolicy != AckNone { 8583 // Need to create pending records here. 8584 if o.state.Pending == nil { 8585 o.state.Pending = make(map[uint64]*Pending) 8586 } 8587 var p *Pending 8588 // Check for an update to a message already delivered. 8589 if sseq <= o.state.Delivered.Stream { 8590 if p = o.state.Pending[sseq]; p != nil { 8591 p.Sequence, p.Timestamp = dseq, ts 8592 } 8593 } else { 8594 // Add to pending. 8595 o.state.Pending[sseq] = &Pending{dseq, ts} 8596 } 8597 // Update delivered as needed. 8598 if dseq > o.state.Delivered.Consumer { 8599 o.state.Delivered.Consumer = dseq 8600 } 8601 if sseq > o.state.Delivered.Stream { 8602 o.state.Delivered.Stream = sseq 8603 } 8604 8605 if dc > 1 { 8606 if maxdc := uint64(o.cfg.MaxDeliver); maxdc > 0 && dc > maxdc { 8607 // Make sure to remove from pending. 8608 delete(o.state.Pending, sseq) 8609 } 8610 if o.state.Redelivered == nil { 8611 o.state.Redelivered = make(map[uint64]uint64) 8612 } 8613 // Only update if greater then what we already have. 8614 if o.state.Redelivered[sseq] < dc-1 { 8615 o.state.Redelivered[sseq] = dc - 1 8616 } 8617 } 8618 } else { 8619 // For AckNone just update delivered and ackfloor at the same time. 8620 if dseq > o.state.Delivered.Consumer { 8621 o.state.Delivered.Consumer = dseq 8622 o.state.AckFloor.Consumer = dseq 8623 } 8624 if sseq > o.state.Delivered.Stream { 8625 o.state.Delivered.Stream = sseq 8626 o.state.AckFloor.Stream = sseq 8627 } 8628 } 8629 // Make sure we flush to disk. 
8630 o.kickFlusher() 8631 8632 return nil 8633 } 8634 8635 // UpdateAcks is called whenever a consumer with explicit ack or ack all acks a message. 8636 func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error { 8637 o.mu.Lock() 8638 defer o.mu.Unlock() 8639 8640 if o.cfg.AckPolicy == AckNone { 8641 return ErrNoAckPolicy 8642 } 8643 8644 // On restarts the old leader may get a replay from the raft logs that are old. 8645 if dseq <= o.state.AckFloor.Consumer { 8646 return nil 8647 } 8648 8649 if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil { 8650 return ErrStoreMsgNotFound 8651 } 8652 8653 // Check for AckAll here. 8654 if o.cfg.AckPolicy == AckAll { 8655 sgap := sseq - o.state.AckFloor.Stream 8656 o.state.AckFloor.Consumer = dseq 8657 o.state.AckFloor.Stream = sseq 8658 for seq := sseq; seq > sseq-sgap; seq-- { 8659 delete(o.state.Pending, seq) 8660 if len(o.state.Redelivered) > 0 { 8661 delete(o.state.Redelivered, seq) 8662 } 8663 } 8664 o.kickFlusher() 8665 return nil 8666 } 8667 8668 // AckExplicit 8669 8670 // First delete from our pending state. 8671 if p, ok := o.state.Pending[sseq]; ok { 8672 delete(o.state.Pending, sseq) 8673 dseq = p.Sequence // Use the original. 8674 } 8675 if len(o.state.Pending) == 0 { 8676 o.state.AckFloor.Consumer = o.state.Delivered.Consumer 8677 o.state.AckFloor.Stream = o.state.Delivered.Stream 8678 } else if dseq == o.state.AckFloor.Consumer+1 { 8679 o.state.AckFloor.Consumer = dseq 8680 o.state.AckFloor.Stream = sseq 8681 8682 if o.state.Delivered.Consumer > dseq { 8683 for ss := sseq + 1; ss <= o.state.Delivered.Stream; ss++ { 8684 if p, ok := o.state.Pending[ss]; ok { 8685 if p.Sequence > 0 { 8686 o.state.AckFloor.Consumer = p.Sequence - 1 8687 o.state.AckFloor.Stream = ss - 1 8688 } 8689 break 8690 } 8691 } 8692 } 8693 } 8694 // We do these regardless. 
8695 delete(o.state.Redelivered, sseq) 8696 8697 o.kickFlusher() 8698 return nil 8699 } 8700 8701 const seqsHdrSize = 6*binary.MaxVarintLen64 + hdrLen 8702 8703 // Encode our consumer state, version 2. 8704 // Lock should be held. 8705 8706 func (o *consumerFileStore) EncodedState() ([]byte, error) { 8707 o.mu.Lock() 8708 defer o.mu.Unlock() 8709 return o.encodeState() 8710 } 8711 8712 func (o *consumerFileStore) encodeState() ([]byte, error) { 8713 // Grab reference to state, but make sure we load in if needed, so do not reference o.state directly. 8714 state, err := o.stateWithCopyLocked(false) 8715 if err != nil { 8716 return nil, err 8717 } 8718 return encodeConsumerState(state), nil 8719 } 8720 8721 func (o *consumerFileStore) UpdateConfig(cfg *ConsumerConfig) error { 8722 o.mu.Lock() 8723 defer o.mu.Unlock() 8724 8725 // This is mostly unchecked here. We are assuming the upper layers have done sanity checking. 8726 csi := o.cfg 8727 csi.ConsumerConfig = *cfg 8728 8729 return o.writeConsumerMeta() 8730 } 8731 8732 func (o *consumerFileStore) Update(state *ConsumerState) error { 8733 o.mu.Lock() 8734 defer o.mu.Unlock() 8735 8736 // Check to see if this is an outdated update. 8737 if state.Delivered.Consumer < o.state.Delivered.Consumer || state.AckFloor.Stream < o.state.AckFloor.Stream { 8738 return nil 8739 } 8740 8741 // Sanity checks. 8742 if state.AckFloor.Consumer > state.Delivered.Consumer { 8743 return fmt.Errorf("bad ack floor for consumer") 8744 } 8745 if state.AckFloor.Stream > state.Delivered.Stream { 8746 return fmt.Errorf("bad ack floor for stream") 8747 } 8748 8749 // Copy to our state. 
8750 var pending map[uint64]*Pending 8751 var redelivered map[uint64]uint64 8752 if len(state.Pending) > 0 { 8753 pending = make(map[uint64]*Pending, len(state.Pending)) 8754 for seq, p := range state.Pending { 8755 pending[seq] = &Pending{p.Sequence, p.Timestamp} 8756 if seq <= state.AckFloor.Stream || seq > state.Delivered.Stream { 8757 return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq) 8758 } 8759 } 8760 } 8761 if len(state.Redelivered) > 0 { 8762 redelivered = make(map[uint64]uint64, len(state.Redelivered)) 8763 for seq, dc := range state.Redelivered { 8764 redelivered[seq] = dc 8765 } 8766 } 8767 8768 o.state.Delivered = state.Delivered 8769 o.state.AckFloor = state.AckFloor 8770 o.state.Pending = pending 8771 o.state.Redelivered = redelivered 8772 8773 o.kickFlusher() 8774 8775 return nil 8776 } 8777 8778 // Will encrypt the state with our asset key. Will be a no-op if encryption not enabled. 8779 // Lock should be held. 8780 func (o *consumerFileStore) encryptState(buf []byte) ([]byte, error) { 8781 if o.aek == nil { 8782 return buf, nil 8783 } 8784 // TODO(dlc) - Optimize on space usage a bit? 8785 nonce := make([]byte, o.aek.NonceSize(), o.aek.NonceSize()+len(buf)+o.aek.Overhead()) 8786 if n, err := rand.Read(nonce); err != nil { 8787 return nil, err 8788 } else if n != len(nonce) { 8789 return nil, fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce)) 8790 } 8791 return o.aek.Seal(nonce, nonce, buf, nil), nil 8792 } 8793 8794 // Used to limit number of disk IO calls in flight since they could all be blocking an OS thread. 8795 // https://github.com/nats-io/nats-server/issues/2742 8796 var dios chan struct{} 8797 8798 // Used to setup our simplistic counting semaphore using buffered channels. 8799 // golang.org's semaphore seemed a bit heavy. 8800 func init() { 8801 // Limit ourselves to a max of 4 blocking IO calls. 8802 const nIO = 4 8803 dios = make(chan struct{}, nIO) 8804 // Fill it up to start. 
8805 for i := 0; i < nIO; i++ { 8806 dios <- struct{}{} 8807 } 8808 } 8809 8810 func (o *consumerFileStore) writeState(buf []byte) error { 8811 // Check if we have the index file open. 8812 o.mu.Lock() 8813 if o.writing || len(buf) == 0 { 8814 o.mu.Unlock() 8815 return nil 8816 } 8817 8818 // Check on encryption. 8819 if o.aek != nil { 8820 var err error 8821 if buf, err = o.encryptState(buf); err != nil { 8822 return err 8823 } 8824 } 8825 8826 o.writing = true 8827 o.dirty = false 8828 ifn := o.ifn 8829 o.mu.Unlock() 8830 8831 // Lock not held here but we do limit number of outstanding calls that could block OS threads. 8832 <-dios 8833 err := os.WriteFile(ifn, buf, defaultFilePerms) 8834 dios <- struct{}{} 8835 8836 o.mu.Lock() 8837 if err != nil { 8838 o.dirty = true 8839 } 8840 o.writing = false 8841 o.mu.Unlock() 8842 8843 return err 8844 } 8845 8846 // Will upodate the config. Only used when recovering ephemerals. 8847 func (o *consumerFileStore) updateConfig(cfg ConsumerConfig) error { 8848 o.mu.Lock() 8849 defer o.mu.Unlock() 8850 o.cfg = &FileConsumerInfo{ConsumerConfig: cfg} 8851 return o.writeConsumerMeta() 8852 } 8853 8854 // Write out the consumer meta data, i.e. state. 8855 // Lock should be held. 
8856 func (cfs *consumerFileStore) writeConsumerMeta() error { 8857 meta := filepath.Join(cfs.odir, JetStreamMetaFile) 8858 if _, err := os.Stat(meta); err != nil && !os.IsNotExist(err) { 8859 return err 8860 } 8861 8862 if cfs.prf != nil && cfs.aek == nil { 8863 fs := cfs.fs 8864 key, _, _, encrypted, err := fs.genEncryptionKeys(fs.cfg.Name + tsep + cfs.name) 8865 if err != nil { 8866 return err 8867 } 8868 cfs.aek = key 8869 keyFile := filepath.Join(cfs.odir, JetStreamMetaFileKey) 8870 if _, err := os.Stat(keyFile); err != nil && !os.IsNotExist(err) { 8871 return err 8872 } 8873 <-dios 8874 err = os.WriteFile(keyFile, encrypted, defaultFilePerms) 8875 dios <- struct{}{} 8876 if err != nil { 8877 return err 8878 } 8879 } 8880 8881 b, err := json.Marshal(cfs.cfg) 8882 if err != nil { 8883 return err 8884 } 8885 // Encrypt if needed. 8886 if cfs.aek != nil { 8887 nonce := make([]byte, cfs.aek.NonceSize(), cfs.aek.NonceSize()+len(b)+cfs.aek.Overhead()) 8888 if n, err := rand.Read(nonce); err != nil { 8889 return err 8890 } else if n != len(nonce) { 8891 return fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce)) 8892 } 8893 b = cfs.aek.Seal(nonce, nonce, b, nil) 8894 } 8895 8896 <-dios 8897 err = os.WriteFile(meta, b, defaultFilePerms) 8898 dios <- struct{}{} 8899 if err != nil { 8900 return err 8901 } 8902 cfs.hh.Reset() 8903 cfs.hh.Write(b) 8904 checksum := hex.EncodeToString(cfs.hh.Sum(nil)) 8905 sum := filepath.Join(cfs.odir, JetStreamMetaFileSum) 8906 8907 <-dios 8908 err = os.WriteFile(sum, []byte(checksum), defaultFilePerms) 8909 dios <- struct{}{} 8910 if err != nil { 8911 return err 8912 } 8913 return nil 8914 } 8915 8916 // Consumer version. 
8917 func checkConsumerHeader(hdr []byte) (uint8, error) { 8918 if hdr == nil || len(hdr) < 2 || hdr[0] != magic { 8919 return 0, errCorruptState 8920 } 8921 version := hdr[1] 8922 switch version { 8923 case 1, 2: 8924 return version, nil 8925 } 8926 return 0, fmt.Errorf("unsupported version: %d", version) 8927 } 8928 8929 func (o *consumerFileStore) copyPending() map[uint64]*Pending { 8930 pending := make(map[uint64]*Pending, len(o.state.Pending)) 8931 for seq, p := range o.state.Pending { 8932 pending[seq] = &Pending{p.Sequence, p.Timestamp} 8933 } 8934 return pending 8935 } 8936 8937 func (o *consumerFileStore) copyRedelivered() map[uint64]uint64 { 8938 redelivered := make(map[uint64]uint64, len(o.state.Redelivered)) 8939 for seq, dc := range o.state.Redelivered { 8940 redelivered[seq] = dc 8941 } 8942 return redelivered 8943 } 8944 8945 // Type returns the type of the underlying store. 8946 func (o *consumerFileStore) Type() StorageType { return FileStorage } 8947 8948 // State retrieves the state from the state file. 8949 // This is not expected to be called in high performance code, only on startup. 8950 func (o *consumerFileStore) State() (*ConsumerState, error) { 8951 return o.stateWithCopy(true) 8952 } 8953 8954 // This will not copy pending or redelivered, so should only be done under the 8955 // consumer owner's lock. 8956 func (o *consumerFileStore) BorrowState() (*ConsumerState, error) { 8957 return o.stateWithCopy(false) 8958 } 8959 8960 func (o *consumerFileStore) stateWithCopy(doCopy bool) (*ConsumerState, error) { 8961 o.mu.Lock() 8962 defer o.mu.Unlock() 8963 return o.stateWithCopyLocked(doCopy) 8964 } 8965 8966 // Lock should be held. 8967 func (o *consumerFileStore) stateWithCopyLocked(doCopy bool) (*ConsumerState, error) { 8968 if o.closed { 8969 return nil, ErrStoreClosed 8970 } 8971 8972 state := &ConsumerState{} 8973 8974 // See if we have a running state or if we need to read in from disk. 
8975 if o.state.Delivered.Consumer != 0 || o.state.Delivered.Stream != 0 { 8976 state.Delivered = o.state.Delivered 8977 state.AckFloor = o.state.AckFloor 8978 if len(o.state.Pending) > 0 { 8979 if doCopy { 8980 state.Pending = o.copyPending() 8981 } else { 8982 state.Pending = o.state.Pending 8983 } 8984 } 8985 if len(o.state.Redelivered) > 0 { 8986 if doCopy { 8987 state.Redelivered = o.copyRedelivered() 8988 } else { 8989 state.Redelivered = o.state.Redelivered 8990 } 8991 } 8992 return state, nil 8993 } 8994 8995 // Read the state in here from disk.. 8996 <-dios 8997 buf, err := os.ReadFile(o.ifn) 8998 dios <- struct{}{} 8999 9000 if err != nil && !os.IsNotExist(err) { 9001 return nil, err 9002 } 9003 9004 if len(buf) == 0 { 9005 return state, nil 9006 } 9007 9008 // Check on encryption. 9009 if o.aek != nil { 9010 ns := o.aek.NonceSize() 9011 buf, err = o.aek.Open(nil, buf[:ns], buf[ns:], nil) 9012 if err != nil { 9013 return nil, err 9014 } 9015 } 9016 9017 state, err = decodeConsumerState(buf) 9018 if err != nil { 9019 return nil, err 9020 } 9021 9022 // Copy this state into our own. 9023 o.state.Delivered = state.Delivered 9024 o.state.AckFloor = state.AckFloor 9025 if len(state.Pending) > 0 { 9026 if doCopy { 9027 o.state.Pending = make(map[uint64]*Pending, len(state.Pending)) 9028 for seq, p := range state.Pending { 9029 o.state.Pending[seq] = &Pending{p.Sequence, p.Timestamp} 9030 } 9031 } else { 9032 o.state.Pending = state.Pending 9033 } 9034 } 9035 if len(state.Redelivered) > 0 { 9036 if doCopy { 9037 o.state.Redelivered = make(map[uint64]uint64, len(state.Redelivered)) 9038 for seq, dc := range state.Redelivered { 9039 o.state.Redelivered[seq] = dc 9040 } 9041 } else { 9042 o.state.Redelivered = state.Redelivered 9043 } 9044 } 9045 9046 return state, nil 9047 } 9048 9049 // Lock should be held. Called at startup. 
9050 func (o *consumerFileStore) loadState() { 9051 if _, err := os.Stat(o.ifn); err == nil { 9052 // This will load our state in from disk. 9053 o.stateWithCopyLocked(false) 9054 } 9055 } 9056 9057 // Decode consumer state. 9058 func decodeConsumerState(buf []byte) (*ConsumerState, error) { 9059 version, err := checkConsumerHeader(buf) 9060 if err != nil { 9061 return nil, err 9062 } 9063 9064 bi := hdrLen 9065 // Helpers, will set i to -1 on error. 9066 readSeq := func() uint64 { 9067 if bi < 0 { 9068 return 0 9069 } 9070 seq, n := binary.Uvarint(buf[bi:]) 9071 if n <= 0 { 9072 bi = -1 9073 return 0 9074 } 9075 bi += n 9076 return seq 9077 } 9078 readTimeStamp := func() int64 { 9079 if bi < 0 { 9080 return 0 9081 } 9082 ts, n := binary.Varint(buf[bi:]) 9083 if n <= 0 { 9084 bi = -1 9085 return -1 9086 } 9087 bi += n 9088 return ts 9089 } 9090 // Just for clarity below. 9091 readLen := readSeq 9092 readCount := readSeq 9093 9094 state := &ConsumerState{} 9095 state.AckFloor.Consumer = readSeq() 9096 state.AckFloor.Stream = readSeq() 9097 state.Delivered.Consumer = readSeq() 9098 state.Delivered.Stream = readSeq() 9099 9100 if bi == -1 { 9101 return nil, errCorruptState 9102 } 9103 if version == 1 { 9104 // Adjust back. Version 1 also stored delivered as next to be delivered, 9105 // so adjust that back down here. 9106 if state.AckFloor.Consumer > 1 { 9107 state.Delivered.Consumer += state.AckFloor.Consumer - 1 9108 } 9109 if state.AckFloor.Stream > 1 { 9110 state.Delivered.Stream += state.AckFloor.Stream - 1 9111 } 9112 } 9113 9114 // Protect ourselves against rolling backwards. 9115 const hbit = 1 << 63 9116 if state.AckFloor.Stream&hbit != 0 || state.Delivered.Stream&hbit != 0 { 9117 return nil, errCorruptState 9118 } 9119 9120 // We have additional stuff. 
9121 if numPending := readLen(); numPending > 0 { 9122 mints := readTimeStamp() 9123 state.Pending = make(map[uint64]*Pending, numPending) 9124 for i := 0; i < int(numPending); i++ { 9125 sseq := readSeq() 9126 var dseq uint64 9127 if version == 2 { 9128 dseq = readSeq() 9129 } 9130 ts := readTimeStamp() 9131 // Check the state machine for corruption, not the value which could be -1. 9132 if bi == -1 { 9133 return nil, errCorruptState 9134 } 9135 // Adjust seq back. 9136 sseq += state.AckFloor.Stream 9137 if sseq == 0 { 9138 return nil, errCorruptState 9139 } 9140 if version == 2 { 9141 dseq += state.AckFloor.Consumer 9142 } 9143 // Adjust the timestamp back. 9144 if version == 1 { 9145 ts = (ts + mints) * int64(time.Second) 9146 } else { 9147 ts = (mints - ts) * int64(time.Second) 9148 } 9149 // Store in pending. 9150 state.Pending[sseq] = &Pending{dseq, ts} 9151 } 9152 } 9153 9154 // We have redelivered entries here. 9155 if numRedelivered := readLen(); numRedelivered > 0 { 9156 state.Redelivered = make(map[uint64]uint64, numRedelivered) 9157 for i := 0; i < int(numRedelivered); i++ { 9158 if seq, n := readSeq(), readCount(); seq > 0 && n > 0 { 9159 // Adjust seq back. 9160 seq += state.AckFloor.Stream 9161 state.Redelivered[seq] = n 9162 } 9163 } 9164 } 9165 9166 return state, nil 9167 } 9168 9169 // Stop the processing of the consumers's state. 9170 func (o *consumerFileStore) Stop() error { 9171 o.mu.Lock() 9172 if o.closed { 9173 o.mu.Unlock() 9174 return nil 9175 } 9176 if o.qch != nil { 9177 close(o.qch) 9178 o.qch = nil 9179 } 9180 9181 var err error 9182 var buf []byte 9183 9184 if o.dirty { 9185 // Make sure to write this out.. 
9186 if buf, err = o.encodeState(); err == nil && len(buf) > 0 { 9187 if o.aek != nil { 9188 if buf, err = o.encryptState(buf); err != nil { 9189 return err 9190 } 9191 } 9192 } 9193 } 9194 9195 o.odir = _EMPTY_ 9196 o.closed = true 9197 ifn, fs := o.ifn, o.fs 9198 o.mu.Unlock() 9199 9200 fs.RemoveConsumer(o) 9201 9202 if len(buf) > 0 { 9203 o.waitOnFlusher() 9204 <-dios 9205 err = os.WriteFile(ifn, buf, defaultFilePerms) 9206 dios <- struct{}{} 9207 } 9208 return err 9209 } 9210 9211 func (o *consumerFileStore) waitOnFlusher() { 9212 if !o.inFlusher() { 9213 return 9214 } 9215 9216 timeout := time.Now().Add(100 * time.Millisecond) 9217 for time.Now().Before(timeout) { 9218 if !o.inFlusher() { 9219 return 9220 } 9221 time.Sleep(10 * time.Millisecond) 9222 } 9223 } 9224 9225 // Delete the consumer. 9226 func (o *consumerFileStore) Delete() error { 9227 return o.delete(false) 9228 } 9229 9230 func (o *consumerFileStore) StreamDelete() error { 9231 return o.delete(true) 9232 } 9233 9234 func (o *consumerFileStore) delete(streamDeleted bool) error { 9235 o.mu.Lock() 9236 if o.closed { 9237 o.mu.Unlock() 9238 return nil 9239 } 9240 if o.qch != nil { 9241 close(o.qch) 9242 o.qch = nil 9243 } 9244 9245 var err error 9246 odir := o.odir 9247 o.odir = _EMPTY_ 9248 o.closed = true 9249 fs := o.fs 9250 o.mu.Unlock() 9251 9252 // If our stream was not deleted this will remove the directories. 
9253 if odir != _EMPTY_ && !streamDeleted { 9254 <-dios 9255 err = os.RemoveAll(odir) 9256 dios <- struct{}{} 9257 } 9258 9259 if !streamDeleted { 9260 fs.RemoveConsumer(o) 9261 } 9262 9263 return err 9264 } 9265 9266 func (fs *fileStore) AddConsumer(o ConsumerStore) error { 9267 fs.cmu.Lock() 9268 defer fs.cmu.Unlock() 9269 fs.cfs = append(fs.cfs, o) 9270 return nil 9271 } 9272 9273 func (fs *fileStore) RemoveConsumer(o ConsumerStore) error { 9274 fs.cmu.Lock() 9275 defer fs.cmu.Unlock() 9276 for i, cfs := range fs.cfs { 9277 if o == cfs { 9278 fs.cfs = append(fs.cfs[:i], fs.cfs[i+1:]...) 9279 break 9280 } 9281 } 9282 return nil 9283 } 9284 9285 //////////////////////////////////////////////////////////////////////////////// 9286 // Templates 9287 //////////////////////////////////////////////////////////////////////////////// 9288 9289 type templateFileStore struct { 9290 dir string 9291 hh hash.Hash64 9292 } 9293 9294 func newTemplateFileStore(storeDir string) *templateFileStore { 9295 tdir := filepath.Join(storeDir, tmplsDir) 9296 key := sha256.Sum256([]byte("templates")) 9297 hh, err := highwayhash.New64(key[:]) 9298 if err != nil { 9299 return nil 9300 } 9301 return &templateFileStore{dir: tdir, hh: hh} 9302 } 9303 9304 func (ts *templateFileStore) Store(t *streamTemplate) error { 9305 dir := filepath.Join(ts.dir, t.Name) 9306 if err := os.MkdirAll(dir, defaultDirPerms); err != nil { 9307 return fmt.Errorf("could not create templates storage directory for %q- %v", t.Name, err) 9308 } 9309 meta := filepath.Join(dir, JetStreamMetaFile) 9310 if _, err := os.Stat(meta); (err != nil && !os.IsNotExist(err)) || err == nil { 9311 return err 9312 } 9313 t.mu.Lock() 9314 b, err := json.Marshal(t) 9315 t.mu.Unlock() 9316 if err != nil { 9317 return err 9318 } 9319 if err := os.WriteFile(meta, b, defaultFilePerms); err != nil { 9320 return err 9321 } 9322 // FIXME(dlc) - Do checksum 9323 ts.hh.Reset() 9324 ts.hh.Write(b) 9325 checksum := 
hex.EncodeToString(ts.hh.Sum(nil)) 9326 sum := filepath.Join(dir, JetStreamMetaFileSum) 9327 if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil { 9328 return err 9329 } 9330 return nil 9331 } 9332 9333 func (ts *templateFileStore) Delete(t *streamTemplate) error { 9334 return os.RemoveAll(filepath.Join(ts.dir, t.Name)) 9335 } 9336 9337 //////////////////////////////////////////////////////////////////////////////// 9338 // Compression 9339 //////////////////////////////////////////////////////////////////////////////// 9340 9341 type CompressionInfo struct { 9342 Algorithm StoreCompression 9343 OriginalSize uint64 9344 } 9345 9346 func (c *CompressionInfo) MarshalMetadata() []byte { 9347 b := make([]byte, 14) // 4 + potentially up to 10 for uint64 9348 b[0], b[1], b[2] = 'c', 'm', 'p' 9349 b[3] = byte(c.Algorithm) 9350 n := binary.PutUvarint(b[4:], c.OriginalSize) 9351 return b[:4+n] 9352 } 9353 9354 func (c *CompressionInfo) UnmarshalMetadata(b []byte) (int, error) { 9355 c.Algorithm = NoCompression 9356 c.OriginalSize = 0 9357 if len(b) < 5 { // 4 + min 1 for uvarint uint64 9358 return 0, nil 9359 } 9360 if b[0] != 'c' || b[1] != 'm' || b[2] != 'p' { 9361 return 0, nil 9362 } 9363 var n int 9364 c.Algorithm = StoreCompression(b[3]) 9365 c.OriginalSize, n = binary.Uvarint(b[4:]) 9366 if n <= 0 { 9367 return 0, fmt.Errorf("metadata incomplete") 9368 } 9369 return 4 + n, nil 9370 } 9371 9372 func (alg StoreCompression) Compress(buf []byte) ([]byte, error) { 9373 if len(buf) < checksumSize { 9374 return nil, fmt.Errorf("uncompressed buffer is too short") 9375 } 9376 bodyLen := int64(len(buf) - checksumSize) 9377 var output bytes.Buffer 9378 var writer io.WriteCloser 9379 switch alg { 9380 case NoCompression: 9381 return buf, nil 9382 case S2Compression: 9383 writer = s2.NewWriter(&output) 9384 default: 9385 return nil, fmt.Errorf("compression algorithm not known") 9386 } 9387 9388 input := bytes.NewReader(buf[:bodyLen]) 9389 checksum := 
buf[bodyLen:] 9390 9391 // Compress the block content, but don't compress the checksum. 9392 // We will preserve it at the end of the block as-is. 9393 if n, err := io.CopyN(writer, input, bodyLen); err != nil { 9394 return nil, fmt.Errorf("error writing to compression writer: %w", err) 9395 } else if n != bodyLen { 9396 return nil, fmt.Errorf("short write on body (%d != %d)", n, bodyLen) 9397 } 9398 if err := writer.Close(); err != nil { 9399 return nil, fmt.Errorf("error closing compression writer: %w", err) 9400 } 9401 9402 // Now add the checksum back onto the end of the block. 9403 if n, err := output.Write(checksum); err != nil { 9404 return nil, fmt.Errorf("error writing checksum: %w", err) 9405 } else if n != checksumSize { 9406 return nil, fmt.Errorf("short write on checksum (%d != %d)", n, checksumSize) 9407 } 9408 9409 return output.Bytes(), nil 9410 } 9411 9412 func (alg StoreCompression) Decompress(buf []byte) ([]byte, error) { 9413 if len(buf) < checksumSize { 9414 return nil, fmt.Errorf("compressed buffer is too short") 9415 } 9416 bodyLen := int64(len(buf) - checksumSize) 9417 input := bytes.NewReader(buf[:bodyLen]) 9418 9419 var reader io.ReadCloser 9420 switch alg { 9421 case NoCompression: 9422 return buf, nil 9423 case S2Compression: 9424 reader = io.NopCloser(s2.NewReader(input)) 9425 default: 9426 return nil, fmt.Errorf("compression algorithm not known") 9427 } 9428 9429 // Decompress the block content. The checksum isn't compressed so 9430 // we can preserve it from the end of the block as-is. 9431 checksum := buf[bodyLen:] 9432 output, err := io.ReadAll(reader) 9433 if err != nil { 9434 return nil, fmt.Errorf("error reading compression reader: %w", err) 9435 } 9436 output = append(output, checksum...) 9437 9438 return output, reader.Close() 9439 }